blob: 6cf5cb2a41fb384d6cab284b7b35d584e34b3908 [file] [log] [blame]
/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com>.

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Raymond Hettingerac2ef652015-07-04 16:04:44 -070045#include "stringlib/eq.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000046
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000047#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000048#include <windows.h>
49#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000050
Larry Hastings61272b72014-01-07 12:41:53 -080051/*[clinic input]
Larry Hastingsc2047262014-01-25 20:43:29 -080052class str "PyUnicodeObject *" "&PyUnicode_Type"
Larry Hastings61272b72014-01-07 12:41:53 -080053[clinic start generated code]*/
Larry Hastings581ee362014-01-28 05:00:08 -080054/*[clinic end generated code: output=da39a3ee5e6b4b0d input=604e916854800fa8]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080055
/* --- Globals ------------------------------------------------------------

NOTE: In the interpreter's initialization phase, some globals are currently
      initialized dynamically as needed. In the process Unicode objects may
      be created before the Unicode type is ready.

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000063
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000064
65#ifdef __cplusplus
66extern "C" {
67#endif
68
/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
#define MAX_UNICODE 0x10ffff

/* Sanity check used by the accessor macros below.  Debug builds run
   _PyUnicode_CheckConsistency() (without scanning the string content);
   other builds fall back to PyUnicode_Check(). */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020077
/* Field accessors.

   The macros whose name starts with an underscore are raw lvalue accessors:
   they only cast `op` and select a struct member.  The variants without the
   underscore (and _PyUnicode_KIND / _PyUnicode_GET_LENGTH) additionally
   assert that `op` passes _PyUnicode_CHECK(). */

/* Raw `utf8` member of a PyCompactUnicodeObject. */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)

/* UTF-8 buffer of a ready string.  For compact ASCII strings this is the
   memory immediately following the PyASCIIObject struct; otherwise it is
   the `utf8` member. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))

/* Raw `utf8_length` member of a PyCompactUnicodeObject. */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)

/* UTF-8 length of a ready string.  For compact ASCII strings this is the
   `length` member; otherwise it is `utf8_length`. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))

/* Raw members, no checks performed. */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)

/* `state.kind` and `length`, guarded by the _PyUnicode_CHECK() assertion. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)

/* Raw `data.any` member of a PyUnicodeObject. */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200112
/* File-local replacement for PyUnicode_READY(): evaluates to 0 when `op` is
   already ready, otherwise to the result of _PyUnicode_Ready(op).  The
   previous definition is dropped first so that this one, which asserts
   _PyUnicode_CHECK(op), is used throughout this file. */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200119
/* true if the utf8 pointer is the very same memory block as the character
   data (only meaningful for strings that are not compact ASCII) */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* true if the wstr pointer is the very same memory block as the character
   data.  The macro argument is used consistently: the previous expansion
   referenced a variable named `unicode` from the caller's scope instead of
   `op`. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
127
/* true if the Unicode object has an allocated UTF-8 memory block
   (not shared with other data): the string is not compact ASCII, the utf8
   pointer is set, and it differs from the character data pointer */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    ((!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* true if the Unicode object has an allocated wstr memory block
   (not shared with other data): the wstr pointer is set and either the
   string is not ready or wstr differs from the character data pointer */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    ((_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
141
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   The bulk of the input is copied four elements per iteration; the
   remaining (at most three) elements are copied one by one.  All
   temporaries carry a leading underscore so that they do not shadow
   identifiers used in the macro arguments. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (const from_type *)(begin); \
        const from_type *_end = (const from_type *)(end); \
        Py_ssize_t _n = (_end) - (_iter);               \
        const from_type *_unrolled_end =                \
            _iter + _Py_SIZE_ROUND_DOWN(_n, 4);         \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200165
/* OVERALLOCATE_FACTOR is a divisor: a factor of N corresponds to
   overallocating by 1/N of the requested size. */
#ifdef MS_WINDOWS
   /* On Windows, overallocate by 50% is the best factor */
#  define OVERALLOCATE_FACTOR 2
#else
   /* On Linux, overallocate by 25% is the best factor */
#  define OVERALLOCATE_FACTOR 4
#endif
173
Walter Dörwald16807132007-05-25 13:52:07 +0000174/* This dictionary holds all interned unicode strings. Note that references
175 to strings in this dictionary are *not* counted in the string's ob_refcnt.
176 When the interned string reaches a refcnt of 0 the string deallocation
177 function will delete the reference from this dictionary.
178
179 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000180 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000181*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200182static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000183
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000184/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200185static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200186
Serhiy Storchaka678db842013-01-26 12:16:36 +0200187#define _Py_INCREF_UNICODE_EMPTY() \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200188 do { \
189 if (unicode_empty != NULL) \
190 Py_INCREF(unicode_empty); \
191 else { \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200192 unicode_empty = PyUnicode_New(0, 0); \
193 if (unicode_empty != NULL) { \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200194 Py_INCREF(unicode_empty); \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200195 assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
196 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200197 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200198 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000199
Serhiy Storchaka678db842013-01-26 12:16:36 +0200200#define _Py_RETURN_UNICODE_EMPTY() \
201 do { \
202 _Py_INCREF_UNICODE_EMPTY(); \
203 return unicode_empty; \
204 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000205
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200206/* Forward declaration */
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700207static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200208_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
209
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200210/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200211static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200212
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000213/* Single character Unicode strings in the Latin-1 range are being
214 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200215static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000216
/* Fast detection of the most frequent whitespace characters.
   Lookup table indexed by ASCII code (0..127): 1 for whitespace, else 0. */
const unsigned char _Py_ascii_whitespace[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x0009: * CHARACTER TABULATION */
/*     case 0x000A: * LINE FEED */
/*     case 0x000B: * LINE TABULATION */
/*     case 0x000C: * FORM FEED */
/*     case 0x000D: * CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x001C: * FILE SEPARATOR */
/*     case 0x001D: * GROUP SEPARATOR */
/*     case 0x001E: * RECORD SEPARATOR */
/*     case 0x001F: * UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
/*     case 0x0020: * SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
247
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200248/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200249static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200250static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100251static int unicode_modifiable(PyObject *unicode);
252
Victor Stinnerfe226c02011-10-03 03:52:20 +0200253
Alexander Belopolsky40018472011-02-26 01:02:56 +0000254static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100255_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200256static PyObject *
257_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
258static PyObject *
259_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
260
261static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000262unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000263 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100264 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000265 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
266
Alexander Belopolsky40018472011-02-26 01:02:56 +0000267static void
268raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300269 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100270 PyObject *unicode,
271 Py_ssize_t startpos, Py_ssize_t endpos,
272 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000273
/* Same for linebreaks.
   Lookup table indexed by ASCII code (0..127): 1 for line breaks, else 0. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
301
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300302#include "clinic/unicodeobject.c.h"
303
/* Codec error handlers recognised by name.  _Py_ERROR_OTHER stands for any
   name that is not one of the handlers listed here. */
typedef enum {
    _Py_ERROR_UNKNOWN=0,
    _Py_ERROR_STRICT,
    _Py_ERROR_SURROGATEESCAPE,
    _Py_ERROR_REPLACE,
    _Py_ERROR_IGNORE,
    _Py_ERROR_BACKSLASHREPLACE,
    _Py_ERROR_SURROGATEPASS,
    _Py_ERROR_XMLCHARREFREPLACE,
    _Py_ERROR_OTHER
} _Py_error_handler;

/* Map an error handler name to its _Py_error_handler value.

   errors: NUL-terminated handler name, or NULL.

   Returns _Py_ERROR_STRICT for NULL or "strict", the matching constant for
   the other known names (exact, case-sensitive comparison), and
   _Py_ERROR_OTHER for everything else.  Never returns _Py_ERROR_UNKNOWN. */
static _Py_error_handler
get_error_handler(const char *errors)
{
    static const struct {
        const char *name;
        _Py_error_handler handler;
    } known_handlers[] = {
        {"strict",            _Py_ERROR_STRICT},
        {"surrogateescape",   _Py_ERROR_SURROGATEESCAPE},
        {"replace",           _Py_ERROR_REPLACE},
        {"ignore",            _Py_ERROR_IGNORE},
        {"backslashreplace",  _Py_ERROR_BACKSLASHREPLACE},
        {"surrogatepass",     _Py_ERROR_SURROGATEPASS},
        {"xmlcharrefreplace", _Py_ERROR_XMLCHARREFREPLACE},
    };
    size_t i;

    if (errors == NULL) {
        return _Py_ERROR_STRICT;
    }
    for (i = 0; i < sizeof(known_handlers) / sizeof(known_handlers[0]); i++) {
        if (strcmp(errors, known_handlers[i].name) == 0) {
            return known_handlers[i].handler;
        }
    }
    return _Py_ERROR_OTHER;
}
342
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300343/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
344 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000345Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000346PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000347{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000348#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000349 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000350#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000351 /* This is actually an illegal character, so it should
352 not be passed to unichr. */
353 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000354#endif
355}
356
#ifdef Py_DEBUG
/* Debug-build check of the internal invariants of a Unicode object.

   op:            object to check; must satisfy PyUnicode_Check().
   check_content: if non-zero and the string is not of the legacy wchar
                  kind, also scan the characters to verify that the
                  smallest suitable kind is used and that the data is
                  followed by a null character.

   Every violated invariant trips its own assert().  The function itself
   always returns 1, which makes it usable as assert(...) argument. */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII: only the PyASCIIObject header is present */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* compact, non-ASCII: data follows the compact header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert(compact->utf8 != data);
        }
        else {
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* not ready yet: only the wstr representation exists */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
            else {
                /* ready, non-compact */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    /* ASCII data doubles as the UTF-8 representation */
                    assert(compact->utf8 == data);
                    assert(compact->utf8_length == ascii->length);
                }
                else {
                    assert(compact->utf8 != data);
                }
            }
        }
        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr aliases the data exactly when the kind has the same
               width as wchar_t */
            if (
#if SIZEOF_WCHAR_T == 2
                kind == PyUnicode_2BYTE_KIND
#else
                kind == PyUnicode_4BYTE_KIND
#endif
               )
            {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            }
            else {
                assert(ascii->wstr != data);
            }
        }

        if (compact->utf8 == NULL) {
            assert(compact->utf8_length == 0);
        }
        if (ascii->wstr == NULL) {
            assert(compact->wstr_length == 0);
        }
    }
    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND)
    {
        Py_ssize_t i;
        Py_UCS4 maxchar = 0;
        void *data;
        Py_UCS4 ch;

        data = PyUnicode_DATA(ascii);
        for (i=0; i < ascii->length; i++)
        {
            ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
        }
        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else {
                assert(maxchar < 128);
            }
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
        }
        else {
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
        }
        /* the character data must be followed by a null character */
        assert(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;
}
#endif
472
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100473static PyObject*
474unicode_result_wchar(PyObject *unicode)
475{
476#ifndef Py_DEBUG
477 Py_ssize_t len;
478
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100479 len = _PyUnicode_WSTR_LENGTH(unicode);
480 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100481 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200482 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100483 }
484
485 if (len == 1) {
486 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100487 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100488 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
489 Py_DECREF(unicode);
490 return latin1_char;
491 }
492 }
493
494 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200495 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100496 return NULL;
497 }
498#else
Victor Stinneraa771272012-10-04 02:32:58 +0200499 assert(Py_REFCNT(unicode) == 1);
500
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100501 /* don't make the result ready in debug mode to ensure that the caller
502 makes the string ready before using it */
503 assert(_PyUnicode_CheckConsistency(unicode, 1));
504#endif
505 return unicode;
506}
507
508static PyObject*
509unicode_result_ready(PyObject *unicode)
510{
511 Py_ssize_t length;
512
513 length = PyUnicode_GET_LENGTH(unicode);
514 if (length == 0) {
515 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100516 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200517 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100518 }
519 return unicode_empty;
520 }
521
522 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200523 void *data = PyUnicode_DATA(unicode);
524 int kind = PyUnicode_KIND(unicode);
525 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100526 if (ch < 256) {
527 PyObject *latin1_char = unicode_latin1[ch];
528 if (latin1_char != NULL) {
529 if (unicode != latin1_char) {
530 Py_INCREF(latin1_char);
531 Py_DECREF(unicode);
532 }
533 return latin1_char;
534 }
535 else {
536 assert(_PyUnicode_CheckConsistency(unicode, 1));
537 Py_INCREF(unicode);
538 unicode_latin1[ch] = unicode;
539 return unicode;
540 }
541 }
542 }
543
544 assert(_PyUnicode_CheckConsistency(unicode, 1));
545 return unicode;
546}
547
548static PyObject*
549unicode_result(PyObject *unicode)
550{
551 assert(_PyUnicode_CHECK(unicode));
552 if (PyUnicode_IS_READY(unicode))
553 return unicode_result_ready(unicode);
554 else
555 return unicode_result_wchar(unicode);
556}
557
Victor Stinnerc4b49542011-12-11 22:44:26 +0100558static PyObject*
559unicode_result_unchanged(PyObject *unicode)
560{
561 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500562 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100563 return NULL;
564 Py_INCREF(unicode);
565 return unicode;
566 }
567 else
568 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100569 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100570}
571
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200572/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
573 ASCII, Latin1, UTF-8, etc. */
574static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200575backslashreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200576 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
577{
Victor Stinnerad771582015-10-09 12:38:53 +0200578 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200579 Py_UCS4 ch;
580 enum PyUnicode_Kind kind;
581 void *data;
582
583 assert(PyUnicode_IS_READY(unicode));
584 kind = PyUnicode_KIND(unicode);
585 data = PyUnicode_DATA(unicode);
586
587 size = 0;
588 /* determine replacement size */
589 for (i = collstart; i < collend; ++i) {
590 Py_ssize_t incr;
591
592 ch = PyUnicode_READ(kind, data, i);
593 if (ch < 0x100)
594 incr = 2+2;
595 else if (ch < 0x10000)
596 incr = 2+4;
597 else {
598 assert(ch <= MAX_UNICODE);
Victor Stinner3fa36ff2015-10-09 03:37:11 +0200599 incr = 2+8;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200600 }
601 if (size > PY_SSIZE_T_MAX - incr) {
602 PyErr_SetString(PyExc_OverflowError,
603 "encoded result is too long for a Python string");
604 return NULL;
605 }
606 size += incr;
607 }
608
Victor Stinnerad771582015-10-09 12:38:53 +0200609 str = _PyBytesWriter_Prepare(writer, str, size);
610 if (str == NULL)
611 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200612
613 /* generate replacement */
614 for (i = collstart; i < collend; ++i) {
615 ch = PyUnicode_READ(kind, data, i);
Victor Stinner797485e2015-10-09 03:17:30 +0200616 *str++ = '\\';
617 if (ch >= 0x00010000) {
618 *str++ = 'U';
619 *str++ = Py_hexdigits[(ch>>28)&0xf];
620 *str++ = Py_hexdigits[(ch>>24)&0xf];
621 *str++ = Py_hexdigits[(ch>>20)&0xf];
622 *str++ = Py_hexdigits[(ch>>16)&0xf];
623 *str++ = Py_hexdigits[(ch>>12)&0xf];
624 *str++ = Py_hexdigits[(ch>>8)&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200625 }
Victor Stinner797485e2015-10-09 03:17:30 +0200626 else if (ch >= 0x100) {
627 *str++ = 'u';
628 *str++ = Py_hexdigits[(ch>>12)&0xf];
629 *str++ = Py_hexdigits[(ch>>8)&0xf];
630 }
631 else
632 *str++ = 'x';
633 *str++ = Py_hexdigits[(ch>>4)&0xf];
634 *str++ = Py_hexdigits[ch&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200635 }
636 return str;
637}
638
639/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
640 ASCII, Latin1, UTF-8, etc. */
641static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200642xmlcharrefreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200643 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
644{
Victor Stinnerad771582015-10-09 12:38:53 +0200645 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200646 Py_UCS4 ch;
647 enum PyUnicode_Kind kind;
648 void *data;
649
650 assert(PyUnicode_IS_READY(unicode));
651 kind = PyUnicode_KIND(unicode);
652 data = PyUnicode_DATA(unicode);
653
654 size = 0;
655 /* determine replacement size */
656 for (i = collstart; i < collend; ++i) {
657 Py_ssize_t incr;
658
659 ch = PyUnicode_READ(kind, data, i);
660 if (ch < 10)
661 incr = 2+1+1;
662 else if (ch < 100)
663 incr = 2+2+1;
664 else if (ch < 1000)
665 incr = 2+3+1;
666 else if (ch < 10000)
667 incr = 2+4+1;
668 else if (ch < 100000)
669 incr = 2+5+1;
670 else if (ch < 1000000)
671 incr = 2+6+1;
672 else {
673 assert(ch <= MAX_UNICODE);
674 incr = 2+7+1;
675 }
676 if (size > PY_SSIZE_T_MAX - incr) {
677 PyErr_SetString(PyExc_OverflowError,
678 "encoded result is too long for a Python string");
679 return NULL;
680 }
681 size += incr;
682 }
683
Victor Stinnerad771582015-10-09 12:38:53 +0200684 str = _PyBytesWriter_Prepare(writer, str, size);
685 if (str == NULL)
686 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200687
688 /* generate replacement */
689 for (i = collstart; i < collend; ++i) {
690 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
691 }
692 return str;
693}
694
Thomas Wouters477c8d52006-05-27 19:21:47 +0000695/* --- Bloom Filters ----------------------------------------------------- */
696
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the least
   significant bits of each Unicode character (ch & (BLOOM_WIDTH - 1))
   as the bit index. */
700
701/* the linebreak mask is set up by Unicode_Init below */
702
Antoine Pitrouf068f942010-01-13 14:19:12 +0000703#if LONG_BIT >= 128
704#define BLOOM_WIDTH 128
705#elif LONG_BIT >= 64
706#define BLOOM_WIDTH 64
707#elif LONG_BIT >= 32
708#define BLOOM_WIDTH 32
709#else
710#error "LONG_BIT is smaller than 32"
711#endif
712
Thomas Wouters477c8d52006-05-27 19:21:47 +0000713#define BLOOM_MASK unsigned long
714
Serhiy Storchaka05997252013-01-26 12:14:02 +0200715static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000716
Antoine Pitrouf068f942010-01-13 14:19:12 +0000717#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000718
Benjamin Peterson29060642009-01-31 22:14:21 +0000719#define BLOOM_LINEBREAK(ch) \
720 ((ch) < 128U ? ascii_linebreak[(ch)] : \
721 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000722
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700723static inline BLOOM_MASK
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200724make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000725{
Victor Stinnera85af502013-04-09 21:53:54 +0200726#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
727 do { \
728 TYPE *data = (TYPE *)PTR; \
729 TYPE *end = data + LEN; \
730 Py_UCS4 ch; \
731 for (; data != end; data++) { \
732 ch = *data; \
733 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
734 } \
735 break; \
736 } while (0)
737
Thomas Wouters477c8d52006-05-27 19:21:47 +0000738 /* calculate simple bloom-style bitmask for a given unicode string */
739
Antoine Pitrouf068f942010-01-13 14:19:12 +0000740 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000741
742 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200743 switch (kind) {
744 case PyUnicode_1BYTE_KIND:
745 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
746 break;
747 case PyUnicode_2BYTE_KIND:
748 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
749 break;
750 case PyUnicode_4BYTE_KIND:
751 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
752 break;
753 default:
754 assert(0);
755 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000756 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200757
758#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000759}
760
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300761static int
762ensure_unicode(PyObject *obj)
763{
764 if (!PyUnicode_Check(obj)) {
765 PyErr_Format(PyExc_TypeError,
766 "must be str, not %.100s",
767 Py_TYPE(obj)->tp_name);
768 return -1;
769 }
770 return PyUnicode_READY(obj);
771}
772
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200773/* Compilation of templated routines */
774
775#include "stringlib/asciilib.h"
776#include "stringlib/fastsearch.h"
777#include "stringlib/partition.h"
778#include "stringlib/split.h"
779#include "stringlib/count.h"
780#include "stringlib/find.h"
781#include "stringlib/find_max_char.h"
782#include "stringlib/localeutil.h"
783#include "stringlib/undef.h"
784
785#include "stringlib/ucs1lib.h"
786#include "stringlib/fastsearch.h"
787#include "stringlib/partition.h"
788#include "stringlib/split.h"
789#include "stringlib/count.h"
790#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300791#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200792#include "stringlib/find_max_char.h"
793#include "stringlib/localeutil.h"
794#include "stringlib/undef.h"
795
796#include "stringlib/ucs2lib.h"
797#include "stringlib/fastsearch.h"
798#include "stringlib/partition.h"
799#include "stringlib/split.h"
800#include "stringlib/count.h"
801#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300802#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200803#include "stringlib/find_max_char.h"
804#include "stringlib/localeutil.h"
805#include "stringlib/undef.h"
806
807#include "stringlib/ucs4lib.h"
808#include "stringlib/fastsearch.h"
809#include "stringlib/partition.h"
810#include "stringlib/split.h"
811#include "stringlib/count.h"
812#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300813#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200814#include "stringlib/find_max_char.h"
815#include "stringlib/localeutil.h"
816#include "stringlib/undef.h"
817
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200818#include "stringlib/unicodedefs.h"
819#include "stringlib/fastsearch.h"
820#include "stringlib/count.h"
821#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100822#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200823
Guido van Rossumd57fd912000-03-10 22:53:23 +0000824/* --- Unicode Object ----------------------------------------------------- */
825
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200826static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200827fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200828
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700829static inline Py_ssize_t
830findchar(const void *s, int kind,
831 Py_ssize_t size, Py_UCS4 ch,
832 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200833{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200834 switch (kind) {
835 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200836 if ((Py_UCS1) ch != ch)
837 return -1;
838 if (direction > 0)
839 return ucs1lib_find_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
840 else
841 return ucs1lib_rfind_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200842 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200843 if ((Py_UCS2) ch != ch)
844 return -1;
845 if (direction > 0)
846 return ucs2lib_find_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
847 else
848 return ucs2lib_rfind_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200849 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200850 if (direction > 0)
851 return ucs4lib_find_char((Py_UCS4 *) s, size, ch);
852 else
853 return ucs4lib_rfind_char((Py_UCS4 *) s, size, ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200854 default:
855 assert(0);
856 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200857 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200858}
859
#ifdef Py_DEBUG
/* Fill the characters of a Unicode string located at indexes
   [old_length, length) with 0xFF bytes, to detect bugs earlier.  Nothing is
   written if the string is not longer than old_length.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    Py_ssize_t length = _PyUnicode_LENGTH(unicode);
    Py_UCS1 *start = PyUnicode_1BYTE_DATA(unicode);
    /* the kind is used as the size in bytes of one character */
    int char_size = PyUnicode_KIND(unicode);

    if (old_length < length) {
        memset(start + old_length * char_size, 0xff,
               (length - old_length) * char_size);
    }
}
#endif
878
Victor Stinnerfe226c02011-10-03 03:52:20 +0200879static PyObject*
880resize_compact(PyObject *unicode, Py_ssize_t length)
881{
882 Py_ssize_t char_size;
883 Py_ssize_t struct_size;
884 Py_ssize_t new_size;
885 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100886 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +0200887#ifdef Py_DEBUG
888 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
889#endif
890
Victor Stinner79891572012-05-03 13:43:07 +0200891 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200892 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100893 assert(PyUnicode_IS_COMPACT(unicode));
894
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200895 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100896 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200897 struct_size = sizeof(PyASCIIObject);
898 else
899 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200900 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200901
Victor Stinnerfe226c02011-10-03 03:52:20 +0200902 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
903 PyErr_NoMemory();
904 return NULL;
905 }
906 new_size = (struct_size + (length + 1) * char_size);
907
Serhiy Storchaka7aa69082015-12-03 01:02:03 +0200908 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
909 PyObject_DEL(_PyUnicode_UTF8(unicode));
910 _PyUnicode_UTF8(unicode) = NULL;
911 _PyUnicode_UTF8_LENGTH(unicode) = 0;
912 }
Victor Stinner84def372011-12-11 20:04:56 +0100913 _Py_DEC_REFTOTAL;
914 _Py_ForgetReference(unicode);
915
Serhiy Storchaka20b39b22014-09-28 11:27:24 +0300916 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +0100917 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100918 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200919 PyErr_NoMemory();
920 return NULL;
921 }
Victor Stinner84def372011-12-11 20:04:56 +0100922 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200923 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100924
Victor Stinnerfe226c02011-10-03 03:52:20 +0200925 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200926 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200927 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100928 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200929 _PyUnicode_WSTR_LENGTH(unicode) = length;
930 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100931 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
932 PyObject_DEL(_PyUnicode_WSTR(unicode));
933 _PyUnicode_WSTR(unicode) = NULL;
Victor Stinner5bc03a62016-01-27 16:56:53 +0100934 if (!PyUnicode_IS_ASCII(unicode))
935 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100936 }
Victor Stinnerafffce42012-10-03 23:03:17 +0200937#ifdef Py_DEBUG
938 unicode_fill_invalid(unicode, old_length);
939#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200940 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
941 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200942 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200943 return unicode;
944}
945
Alexander Belopolsky40018472011-02-26 01:02:56 +0000946static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200947resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000948{
Victor Stinner95663112011-10-04 01:03:50 +0200949 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100950 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200951 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200952 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000953
Victor Stinnerfe226c02011-10-03 03:52:20 +0200954 if (PyUnicode_IS_READY(unicode)) {
955 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200956 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200957 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +0200958#ifdef Py_DEBUG
959 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
960#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200961
962 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200963 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200964 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
965 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200966
967 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
968 PyErr_NoMemory();
969 return -1;
970 }
971 new_size = (length + 1) * char_size;
972
Victor Stinner7a9105a2011-12-12 00:13:42 +0100973 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
974 {
975 PyObject_DEL(_PyUnicode_UTF8(unicode));
976 _PyUnicode_UTF8(unicode) = NULL;
977 _PyUnicode_UTF8_LENGTH(unicode) = 0;
978 }
979
Victor Stinnerfe226c02011-10-03 03:52:20 +0200980 data = (PyObject *)PyObject_REALLOC(data, new_size);
981 if (data == NULL) {
982 PyErr_NoMemory();
983 return -1;
984 }
985 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200986 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200987 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200988 _PyUnicode_WSTR_LENGTH(unicode) = length;
989 }
990 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200991 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200992 _PyUnicode_UTF8_LENGTH(unicode) = length;
993 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200994 _PyUnicode_LENGTH(unicode) = length;
995 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +0200996#ifdef Py_DEBUG
997 unicode_fill_invalid(unicode, old_length);
998#endif
Victor Stinner95663112011-10-04 01:03:50 +0200999 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001000 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001001 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001002 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001003 }
Victor Stinner95663112011-10-04 01:03:50 +02001004 assert(_PyUnicode_WSTR(unicode) != NULL);
1005
1006 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001007 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +02001008 PyErr_NoMemory();
1009 return -1;
1010 }
Victor Stinner7a9105a2011-12-12 00:13:42 +01001011 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +02001012 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +01001013 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +02001014 if (!wstr) {
1015 PyErr_NoMemory();
1016 return -1;
1017 }
1018 _PyUnicode_WSTR(unicode) = wstr;
1019 _PyUnicode_WSTR(unicode)[length] = 0;
1020 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001021 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001022 return 0;
1023}
1024
Victor Stinnerfe226c02011-10-03 03:52:20 +02001025static PyObject*
1026resize_copy(PyObject *unicode, Py_ssize_t length)
1027{
1028 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001029 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001030 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001031
Benjamin Petersonbac79492012-01-14 13:34:47 -05001032 if (PyUnicode_READY(unicode) == -1)
Victor Stinner7a9105a2011-12-12 00:13:42 +01001033 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001034
1035 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1036 if (copy == NULL)
1037 return NULL;
1038
1039 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +02001040 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001041 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +02001042 }
1043 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001044 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001045
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001046 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001047 if (w == NULL)
1048 return NULL;
1049 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1050 copy_length = Py_MIN(copy_length, length);
Christian Heimesf051e432016-09-13 20:22:02 +02001051 memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +02001052 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001053 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001054 }
1055}
1056
Guido van Rossumd57fd912000-03-10 22:53:23 +00001057/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +00001058 Ux0000 terminated; some code (e.g. new_identifier)
1059 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001060
1061 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +00001062 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001063
1064*/
1065
Alexander Belopolsky40018472011-02-26 01:02:56 +00001066static PyUnicodeObject *
1067_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001068{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001069 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001070 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001071
Thomas Wouters477c8d52006-05-27 19:21:47 +00001072 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001073 if (length == 0 && unicode_empty != NULL) {
1074 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001075 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001076 }
1077
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001078 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001079 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001080 return (PyUnicodeObject *)PyErr_NoMemory();
1081 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001082 if (length < 0) {
1083 PyErr_SetString(PyExc_SystemError,
1084 "Negative size passed to _PyUnicode_New");
1085 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001086 }
1087
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001088 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1089 if (unicode == NULL)
1090 return NULL;
1091 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +01001092
1093 _PyUnicode_WSTR_LENGTH(unicode) = length;
1094 _PyUnicode_HASH(unicode) = -1;
1095 _PyUnicode_STATE(unicode).interned = 0;
1096 _PyUnicode_STATE(unicode).kind = 0;
1097 _PyUnicode_STATE(unicode).compact = 0;
1098 _PyUnicode_STATE(unicode).ready = 0;
1099 _PyUnicode_STATE(unicode).ascii = 0;
1100 _PyUnicode_DATA_ANY(unicode) = NULL;
1101 _PyUnicode_LENGTH(unicode) = 0;
1102 _PyUnicode_UTF8(unicode) = NULL;
1103 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1104
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001105 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1106 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001107 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00001108 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001109 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00001110 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001111
Jeremy Hyltond8082792003-09-16 19:41:39 +00001112 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +00001113 * the caller fails before initializing str -- unicode_resize()
1114 * reads str[0], and the Keep-Alive optimization can keep memory
1115 * allocated for str alive across a call to unicode_dealloc(unicode).
1116 * We don't want unicode_resize to read uninitialized memory in
1117 * that case.
1118 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001119 _PyUnicode_WSTR(unicode)[0] = 0;
1120 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +01001121
Victor Stinner7931d9a2011-11-04 00:22:48 +01001122 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001123 return unicode;
1124}
1125
Victor Stinnerf42dc442011-10-02 23:33:16 +02001126static const char*
1127unicode_kind_name(PyObject *unicode)
1128{
Victor Stinner42dfd712011-10-03 14:41:45 +02001129 /* don't check consistency: unicode_kind_name() is called from
1130 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +02001131 if (!PyUnicode_IS_COMPACT(unicode))
1132 {
1133 if (!PyUnicode_IS_READY(unicode))
1134 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -06001135 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001136 {
1137 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001138 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001139 return "legacy ascii";
1140 else
1141 return "legacy latin1";
1142 case PyUnicode_2BYTE_KIND:
1143 return "legacy UCS2";
1144 case PyUnicode_4BYTE_KIND:
1145 return "legacy UCS4";
1146 default:
1147 return "<legacy invalid kind>";
1148 }
1149 }
1150 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -06001151 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001152 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001153 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001154 return "ascii";
1155 else
Victor Stinnera3b334d2011-10-03 13:53:37 +02001156 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001157 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001158 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001159 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001160 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001161 default:
1162 return "<invalid compact kind>";
1163 }
1164}
1165
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001166#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001167/* Functions wrapping macros for use in debugger */
/* Function form of the PyUnicode_UTF8() macro, callable from a debugger. */
char *
_PyUnicode_utf8(void *unicode)
{
    return PyUnicode_UTF8(unicode);
}
1171
/* Function form of the _PyUnicode_COMPACT_DATA() macro, callable from a
   debugger. */
void *
_PyUnicode_compact_data(void *unicode)
{
    return _PyUnicode_COMPACT_DATA(unicode);
}
1175void *_PyUnicode_data(void *unicode){
1176 printf("obj %p\n", unicode);
1177 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1178 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1179 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1180 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1181 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1182 return PyUnicode_DATA(unicode);
1183}
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001184
1185void
1186_PyUnicode_Dump(PyObject *op)
1187{
1188 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001189 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1190 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1191 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001192
Victor Stinnera849a4b2011-10-03 12:12:11 +02001193 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001194 {
1195 if (ascii->state.ascii)
1196 data = (ascii + 1);
1197 else
1198 data = (compact + 1);
1199 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001200 else
1201 data = unicode->data.any;
Victor Stinner293f3f52014-07-01 08:57:10 +02001202 printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1203 unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001204
Victor Stinnera849a4b2011-10-03 12:12:11 +02001205 if (ascii->wstr == data)
1206 printf("shared ");
1207 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001208
Victor Stinnera3b334d2011-10-03 13:53:37 +02001209 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinner293f3f52014-07-01 08:57:10 +02001210 printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
Victor Stinnera849a4b2011-10-03 12:12:11 +02001211 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1212 printf("shared ");
Victor Stinner293f3f52014-07-01 08:57:10 +02001213 printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
1214 compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001215 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001216 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001217}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001218#endif
1219
1220PyObject *
1221PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1222{
1223 PyObject *obj;
1224 PyCompactUnicodeObject *unicode;
1225 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001226 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001227 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001228 Py_ssize_t char_size;
1229 Py_ssize_t struct_size;
1230
1231 /* Optimization for empty strings */
1232 if (size == 0 && unicode_empty != NULL) {
1233 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001234 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001235 }
1236
Victor Stinner9e9d6892011-10-04 01:02:02 +02001237 is_ascii = 0;
1238 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001239 struct_size = sizeof(PyCompactUnicodeObject);
1240 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001241 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001242 char_size = 1;
1243 is_ascii = 1;
1244 struct_size = sizeof(PyASCIIObject);
1245 }
1246 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001247 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001248 char_size = 1;
1249 }
1250 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001251 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001252 char_size = 2;
1253 if (sizeof(wchar_t) == 2)
1254 is_sharing = 1;
1255 }
1256 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001257 if (maxchar > MAX_UNICODE) {
1258 PyErr_SetString(PyExc_SystemError,
1259 "invalid maximum character passed to PyUnicode_New");
1260 return NULL;
1261 }
Victor Stinner8f825062012-04-27 13:55:39 +02001262 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001263 char_size = 4;
1264 if (sizeof(wchar_t) == 4)
1265 is_sharing = 1;
1266 }
1267
1268 /* Ensure we won't overflow the size. */
1269 if (size < 0) {
1270 PyErr_SetString(PyExc_SystemError,
1271 "Negative size passed to PyUnicode_New");
1272 return NULL;
1273 }
1274 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1275 return PyErr_NoMemory();
1276
1277 /* Duplicated allocation code from _PyObject_New() instead of a call to
1278 * PyObject_New() so we are able to allocate space for the object and
1279 * it's data buffer.
1280 */
1281 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1282 if (obj == NULL)
1283 return PyErr_NoMemory();
1284 obj = PyObject_INIT(obj, &PyUnicode_Type);
1285 if (obj == NULL)
1286 return NULL;
1287
1288 unicode = (PyCompactUnicodeObject *)obj;
1289 if (is_ascii)
1290 data = ((PyASCIIObject*)obj) + 1;
1291 else
1292 data = unicode + 1;
1293 _PyUnicode_LENGTH(unicode) = size;
1294 _PyUnicode_HASH(unicode) = -1;
1295 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001296 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001297 _PyUnicode_STATE(unicode).compact = 1;
1298 _PyUnicode_STATE(unicode).ready = 1;
1299 _PyUnicode_STATE(unicode).ascii = is_ascii;
1300 if (is_ascii) {
1301 ((char*)data)[size] = 0;
1302 _PyUnicode_WSTR(unicode) = NULL;
1303 }
Victor Stinner8f825062012-04-27 13:55:39 +02001304 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001305 ((char*)data)[size] = 0;
1306 _PyUnicode_WSTR(unicode) = NULL;
1307 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001308 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001309 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001310 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001311 else {
1312 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001313 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001314 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001315 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001316 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001317 ((Py_UCS4*)data)[size] = 0;
1318 if (is_sharing) {
1319 _PyUnicode_WSTR_LENGTH(unicode) = size;
1320 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1321 }
1322 else {
1323 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1324 _PyUnicode_WSTR(unicode) = NULL;
1325 }
1326 }
Victor Stinner8f825062012-04-27 13:55:39 +02001327#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001328 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001329#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001330 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001331 return obj;
1332}
1333
#if SIZEOF_WCHAR_T == 2
/* Helper function to convert a 16-bit wchar_t representation to UCS4: a high
   surrogate immediately followed by a low surrogate is joined into a single
   code point, any other unit is copied unchanged.  The other conversions are
   implemented as macros for efficiency.

   The code points are written to the 4-byte data of unicode.  This function
   assumes that unicode can hold one more code point than wstr characters for
   a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (Py_UNICODE_IS_HIGH_SURROGATE(src[0])
            && (src+1) < end
            && Py_UNICODE_IS_LOW_SURROGATE(src[1]))
        {
            /* surrogate pair: two input units, one code point */
            *dst++ = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
            src += 2;
        }
        else {
            *dst++ = *src++;
        }
    }
    /* the output must be filled exactly */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));

}
#endif
1373
Victor Stinnercd9950f2011-10-02 00:34:53 +02001374static int
Victor Stinner488fa492011-12-12 00:01:39 +01001375unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001376{
Victor Stinner488fa492011-12-12 00:01:39 +01001377 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001378 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001379 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001380 return -1;
1381 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001382 return 0;
1383}
1384
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001385static int
1386_copy_characters(PyObject *to, Py_ssize_t to_start,
1387 PyObject *from, Py_ssize_t from_start,
1388 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001389{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001390 unsigned int from_kind, to_kind;
1391 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001392
Victor Stinneree4544c2012-05-09 22:24:08 +02001393 assert(0 <= how_many);
1394 assert(0 <= from_start);
1395 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001396 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001397 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001398 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001399
Victor Stinnerd3f08822012-05-29 12:57:52 +02001400 assert(PyUnicode_Check(to));
1401 assert(PyUnicode_IS_READY(to));
1402 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1403
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001404 if (how_many == 0)
1405 return 0;
1406
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001407 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001408 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001409 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001410 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001411
Victor Stinnerf1852262012-06-16 16:38:26 +02001412#ifdef Py_DEBUG
1413 if (!check_maxchar
1414 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1415 {
1416 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1417 Py_UCS4 ch;
1418 Py_ssize_t i;
1419 for (i=0; i < how_many; i++) {
1420 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1421 assert(ch <= to_maxchar);
1422 }
1423 }
1424#endif
1425
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001426 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001427 if (check_maxchar
1428 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1429 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001430 /* Writing Latin-1 characters into an ASCII string requires to
1431 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001432 Py_UCS4 max_char;
1433 max_char = ucs1lib_find_max_char(from_data,
1434 (Py_UCS1*)from_data + how_many);
1435 if (max_char >= 128)
1436 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001437 }
Christian Heimesf051e432016-09-13 20:22:02 +02001438 memcpy((char*)to_data + to_kind * to_start,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001439 (char*)from_data + from_kind * from_start,
1440 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001441 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001442 else if (from_kind == PyUnicode_1BYTE_KIND
1443 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001444 {
1445 _PyUnicode_CONVERT_BYTES(
1446 Py_UCS1, Py_UCS2,
1447 PyUnicode_1BYTE_DATA(from) + from_start,
1448 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1449 PyUnicode_2BYTE_DATA(to) + to_start
1450 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001451 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001452 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001453 && to_kind == PyUnicode_4BYTE_KIND)
1454 {
1455 _PyUnicode_CONVERT_BYTES(
1456 Py_UCS1, Py_UCS4,
1457 PyUnicode_1BYTE_DATA(from) + from_start,
1458 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1459 PyUnicode_4BYTE_DATA(to) + to_start
1460 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001461 }
1462 else if (from_kind == PyUnicode_2BYTE_KIND
1463 && to_kind == PyUnicode_4BYTE_KIND)
1464 {
1465 _PyUnicode_CONVERT_BYTES(
1466 Py_UCS2, Py_UCS4,
1467 PyUnicode_2BYTE_DATA(from) + from_start,
1468 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1469 PyUnicode_4BYTE_DATA(to) + to_start
1470 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001471 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001472 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001473 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1474
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001475 if (!check_maxchar) {
1476 if (from_kind == PyUnicode_2BYTE_KIND
1477 && to_kind == PyUnicode_1BYTE_KIND)
1478 {
1479 _PyUnicode_CONVERT_BYTES(
1480 Py_UCS2, Py_UCS1,
1481 PyUnicode_2BYTE_DATA(from) + from_start,
1482 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1483 PyUnicode_1BYTE_DATA(to) + to_start
1484 );
1485 }
1486 else if (from_kind == PyUnicode_4BYTE_KIND
1487 && to_kind == PyUnicode_1BYTE_KIND)
1488 {
1489 _PyUnicode_CONVERT_BYTES(
1490 Py_UCS4, Py_UCS1,
1491 PyUnicode_4BYTE_DATA(from) + from_start,
1492 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1493 PyUnicode_1BYTE_DATA(to) + to_start
1494 );
1495 }
1496 else if (from_kind == PyUnicode_4BYTE_KIND
1497 && to_kind == PyUnicode_2BYTE_KIND)
1498 {
1499 _PyUnicode_CONVERT_BYTES(
1500 Py_UCS4, Py_UCS2,
1501 PyUnicode_4BYTE_DATA(from) + from_start,
1502 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1503 PyUnicode_2BYTE_DATA(to) + to_start
1504 );
1505 }
1506 else {
1507 assert(0);
1508 return -1;
1509 }
1510 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001511 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001512 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001513 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001514 Py_ssize_t i;
1515
Victor Stinnera0702ab2011-09-29 14:14:38 +02001516 for (i=0; i < how_many; i++) {
1517 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001518 if (ch > to_maxchar)
1519 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001520 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1521 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001522 }
1523 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001524 return 0;
1525}
1526
Victor Stinnerd3f08822012-05-29 12:57:52 +02001527void
1528_PyUnicode_FastCopyCharacters(
1529 PyObject *to, Py_ssize_t to_start,
1530 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001531{
1532 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1533}
1534
1535Py_ssize_t
1536PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1537 PyObject *from, Py_ssize_t from_start,
1538 Py_ssize_t how_many)
1539{
1540 int err;
1541
1542 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1543 PyErr_BadInternalCall();
1544 return -1;
1545 }
1546
Benjamin Petersonbac79492012-01-14 13:34:47 -05001547 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001548 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001549 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001550 return -1;
1551
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001552 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001553 PyErr_SetString(PyExc_IndexError, "string index out of range");
1554 return -1;
1555 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001556 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001557 PyErr_SetString(PyExc_IndexError, "string index out of range");
1558 return -1;
1559 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001560 if (how_many < 0) {
1561 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1562 return -1;
1563 }
1564 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001565 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1566 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001567 "Cannot write %zi characters at %zi "
1568 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001569 how_many, to_start, PyUnicode_GET_LENGTH(to));
1570 return -1;
1571 }
1572
1573 if (how_many == 0)
1574 return 0;
1575
Victor Stinner488fa492011-12-12 00:01:39 +01001576 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001577 return -1;
1578
1579 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1580 if (err) {
1581 PyErr_Format(PyExc_SystemError,
1582 "Cannot copy %s characters "
1583 "into a string of %s characters",
1584 unicode_kind_name(from),
1585 unicode_kind_name(to));
1586 return -1;
1587 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001588 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001589}
1590
Victor Stinner17222162011-09-28 22:15:37 +02001591/* Find the maximum code point and count the number of surrogate pairs so a
1592 correct string length can be computed before converting a string to UCS4.
1593 This function counts single surrogates as a character and not as a pair.
1594
1595 Return 0 on success, or -1 on error. */
1596static int
1597find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1598 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001599{
1600 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001601 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001602
Victor Stinnerc53be962011-10-02 21:33:54 +02001603 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001604 *num_surrogates = 0;
1605 *maxchar = 0;
1606
1607 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001608#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001609 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1610 && (iter+1) < end
1611 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1612 {
1613 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1614 ++(*num_surrogates);
1615 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001616 }
1617 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001618#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001619 {
1620 ch = *iter;
1621 iter++;
1622 }
1623 if (ch > *maxchar) {
1624 *maxchar = ch;
1625 if (*maxchar > MAX_UNICODE) {
1626 PyErr_Format(PyExc_ValueError,
1627 "character U+%x is not in range [U+0000; U+10ffff]",
1628 ch);
1629 return -1;
1630 }
1631 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001632 }
1633 return 0;
1634}
1635
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001636int
1637_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001638{
1639 wchar_t *end;
1640 Py_UCS4 maxchar = 0;
1641 Py_ssize_t num_surrogates;
1642#if SIZEOF_WCHAR_T == 2
1643 Py_ssize_t length_wo_surrogates;
1644#endif
1645
Georg Brandl7597add2011-10-05 16:36:47 +02001646 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001647 strings were created using _PyObject_New() and where no canonical
1648 representation (the str field) has been set yet aka strings
1649 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001650 assert(_PyUnicode_CHECK(unicode));
1651 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001652 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001653 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001654 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001655 /* Actually, it should neither be interned nor be anything else: */
1656 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001657
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001658 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001659 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001660 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001661 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001662
1663 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001664 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1665 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001666 PyErr_NoMemory();
1667 return -1;
1668 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001669 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001670 _PyUnicode_WSTR(unicode), end,
1671 PyUnicode_1BYTE_DATA(unicode));
1672 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1673 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1674 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1675 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001676 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001677 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001678 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001679 }
1680 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001681 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001682 _PyUnicode_UTF8(unicode) = NULL;
1683 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001684 }
1685 PyObject_FREE(_PyUnicode_WSTR(unicode));
1686 _PyUnicode_WSTR(unicode) = NULL;
1687 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1688 }
1689 /* In this case we might have to convert down from 4-byte native
1690 wchar_t to 2-byte unicode. */
1691 else if (maxchar < 65536) {
1692 assert(num_surrogates == 0 &&
1693 "FindMaxCharAndNumSurrogatePairs() messed up");
1694
Victor Stinner506f5922011-09-28 22:34:18 +02001695#if SIZEOF_WCHAR_T == 2
1696 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001697 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001698 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1699 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1700 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001701 _PyUnicode_UTF8(unicode) = NULL;
1702 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001703#else
1704 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001705 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001706 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001707 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001708 PyErr_NoMemory();
1709 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001710 }
Victor Stinner506f5922011-09-28 22:34:18 +02001711 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1712 _PyUnicode_WSTR(unicode), end,
1713 PyUnicode_2BYTE_DATA(unicode));
1714 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1715 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1716 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001717 _PyUnicode_UTF8(unicode) = NULL;
1718 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001719 PyObject_FREE(_PyUnicode_WSTR(unicode));
1720 _PyUnicode_WSTR(unicode) = NULL;
1721 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1722#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001723 }
1724 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1725 else {
1726#if SIZEOF_WCHAR_T == 2
1727 /* in case the native representation is 2-bytes, we need to allocate a
1728 new normalized 4-byte version. */
1729 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001730 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1731 PyErr_NoMemory();
1732 return -1;
1733 }
Victor Stinnerc3c74152011-10-02 20:39:55 +02001734 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1735 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001736 PyErr_NoMemory();
1737 return -1;
1738 }
1739 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1740 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001741 _PyUnicode_UTF8(unicode) = NULL;
1742 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001743 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1744 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001745 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001746 PyObject_FREE(_PyUnicode_WSTR(unicode));
1747 _PyUnicode_WSTR(unicode) = NULL;
1748 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1749#else
1750 assert(num_surrogates == 0);
1751
Victor Stinnerc3c74152011-10-02 20:39:55 +02001752 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001753 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001754 _PyUnicode_UTF8(unicode) = NULL;
1755 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001756 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1757#endif
1758 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1759 }
1760 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001761 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001762 return 0;
1763}
1764
Alexander Belopolsky40018472011-02-26 01:02:56 +00001765static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001766unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001767{
Walter Dörwald16807132007-05-25 13:52:07 +00001768 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001769 case SSTATE_NOT_INTERNED:
1770 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001771
Benjamin Peterson29060642009-01-31 22:14:21 +00001772 case SSTATE_INTERNED_MORTAL:
1773 /* revive dead object temporarily for DelItem */
1774 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001775 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001776 Py_FatalError(
1777 "deletion of interned string failed");
1778 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001779
Benjamin Peterson29060642009-01-31 22:14:21 +00001780 case SSTATE_INTERNED_IMMORTAL:
1781 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001782
Benjamin Peterson29060642009-01-31 22:14:21 +00001783 default:
1784 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001785 }
1786
Victor Stinner03490912011-10-03 23:45:12 +02001787 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001788 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001789 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001790 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001791 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1792 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001793
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001794 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001795}
1796
#ifdef Py_DEBUG
/* Debug helper: return 1 if `unicode` is one of the shared singletons (the
   empty string or a cached one-character Latin-1 string), 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;

    /* Only a string of exactly one character that already has a canonical
       representation can be an entry of the Latin-1 cache. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    if (ch >= 256)
        return 0;
    return unicode_latin1[ch] == unicode;
}
#endif
1813
Alexander Belopolsky40018472011-02-26 01:02:56 +00001814static int
Victor Stinner488fa492011-12-12 00:01:39 +01001815unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001816{
Victor Stinner488fa492011-12-12 00:01:39 +01001817 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001818 if (Py_REFCNT(unicode) != 1)
1819 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001820 if (_PyUnicode_HASH(unicode) != -1)
1821 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001822 if (PyUnicode_CHECK_INTERNED(unicode))
1823 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001824 if (!PyUnicode_CheckExact(unicode))
1825 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001826#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001827 /* singleton refcount is greater than 1 */
1828 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001829#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001830 return 1;
1831}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001832
Victor Stinnerfe226c02011-10-03 03:52:20 +02001833static int
1834unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1835{
1836 PyObject *unicode;
1837 Py_ssize_t old_length;
1838
1839 assert(p_unicode != NULL);
1840 unicode = *p_unicode;
1841
1842 assert(unicode != NULL);
1843 assert(PyUnicode_Check(unicode));
1844 assert(0 <= length);
1845
Victor Stinner910337b2011-10-03 03:20:16 +02001846 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001847 old_length = PyUnicode_WSTR_LENGTH(unicode);
1848 else
1849 old_length = PyUnicode_GET_LENGTH(unicode);
1850 if (old_length == length)
1851 return 0;
1852
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001853 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001854 _Py_INCREF_UNICODE_EMPTY();
1855 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001856 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001857 Py_SETREF(*p_unicode, unicode_empty);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001858 return 0;
1859 }
1860
Victor Stinner488fa492011-12-12 00:01:39 +01001861 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001862 PyObject *copy = resize_copy(unicode, length);
1863 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001864 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001865 Py_SETREF(*p_unicode, copy);
Benjamin Peterson29060642009-01-31 22:14:21 +00001866 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001867 }
1868
Victor Stinnerfe226c02011-10-03 03:52:20 +02001869 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001870 PyObject *new_unicode = resize_compact(unicode, length);
1871 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001872 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001873 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001874 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001875 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001876 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001877}
1878
Alexander Belopolsky40018472011-02-26 01:02:56 +00001879int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001880PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001881{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001882 PyObject *unicode;
1883 if (p_unicode == NULL) {
1884 PyErr_BadInternalCall();
1885 return -1;
1886 }
1887 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001888 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001889 {
1890 PyErr_BadInternalCall();
1891 return -1;
1892 }
1893 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001894}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001895
Serhiy Storchakad65c9492015-11-02 14:10:23 +02001896/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001897
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001898 WARNING: The function doesn't copy the terminating null character and
1899 doesn't check the maximum character (may write a latin1 character in an
1900 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001901static void
1902unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1903 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001904{
1905 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1906 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001907 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001908
1909 switch (kind) {
1910 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001911 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02001912#ifdef Py_DEBUG
1913 if (PyUnicode_IS_ASCII(unicode)) {
1914 Py_UCS4 maxchar = ucs1lib_find_max_char(
1915 (const Py_UCS1*)str,
1916 (const Py_UCS1*)str + len);
1917 assert(maxchar < 128);
1918 }
1919#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001920 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001921 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001922 }
1923 case PyUnicode_2BYTE_KIND: {
1924 Py_UCS2 *start = (Py_UCS2 *)data + index;
1925 Py_UCS2 *ucs2 = start;
1926 assert(index <= PyUnicode_GET_LENGTH(unicode));
1927
Victor Stinner184252a2012-06-16 02:57:41 +02001928 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001929 *ucs2 = (Py_UCS2)*str;
1930
1931 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001932 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001933 }
1934 default: {
1935 Py_UCS4 *start = (Py_UCS4 *)data + index;
1936 Py_UCS4 *ucs4 = start;
1937 assert(kind == PyUnicode_4BYTE_KIND);
1938 assert(index <= PyUnicode_GET_LENGTH(unicode));
1939
Victor Stinner184252a2012-06-16 02:57:41 +02001940 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001941 *ucs4 = (Py_UCS4)*str;
1942
1943 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001944 }
1945 }
1946}
1947
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001948static PyObject*
1949get_latin1_char(unsigned char ch)
1950{
Victor Stinnera464fc12011-10-02 20:39:30 +02001951 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001952 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001953 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001954 if (!unicode)
1955 return NULL;
1956 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001957 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001958 unicode_latin1[ch] = unicode;
1959 }
1960 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001961 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001962}
1963
Victor Stinner985a82a2014-01-03 12:53:47 +01001964static PyObject*
1965unicode_char(Py_UCS4 ch)
1966{
1967 PyObject *unicode;
1968
1969 assert(ch <= MAX_UNICODE);
1970
Victor Stinnerf3b46b42014-01-03 13:16:00 +01001971 if (ch < 256)
1972 return get_latin1_char(ch);
1973
Victor Stinner985a82a2014-01-03 12:53:47 +01001974 unicode = PyUnicode_New(1, ch);
1975 if (unicode == NULL)
1976 return NULL;
1977 switch (PyUnicode_KIND(unicode)) {
1978 case PyUnicode_1BYTE_KIND:
1979 PyUnicode_1BYTE_DATA(unicode)[0] = (Py_UCS1)ch;
1980 break;
1981 case PyUnicode_2BYTE_KIND:
1982 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
1983 break;
1984 default:
1985 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1986 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
1987 }
1988 assert(_PyUnicode_CheckConsistency(unicode, 1));
1989 return unicode;
1990}
1991
Alexander Belopolsky40018472011-02-26 01:02:56 +00001992PyObject *
1993PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001994{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001995 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001996 Py_UCS4 maxchar = 0;
1997 Py_ssize_t num_surrogates;
1998
1999 if (u == NULL)
2000 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002001
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002002 /* If the Unicode data is known at construction time, we can apply
2003 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002004
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002005 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02002006 if (size == 0)
2007 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00002008
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002009 /* Single character Unicode objects in the Latin-1 range are
2010 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002011 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002012 return get_latin1_char((unsigned char)*u);
2013
2014 /* If not empty and not single character, copy the Unicode data
2015 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02002016 if (find_maxchar_surrogates(u, u + size,
2017 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002018 return NULL;
2019
Victor Stinner8faf8212011-12-08 22:14:11 +01002020 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002021 if (!unicode)
2022 return NULL;
2023
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002024 switch (PyUnicode_KIND(unicode)) {
2025 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002026 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002027 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2028 break;
2029 case PyUnicode_2BYTE_KIND:
2030#if Py_UNICODE_SIZE == 2
Christian Heimesf051e432016-09-13 20:22:02 +02002031 memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002032#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002033 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002034 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2035#endif
2036 break;
2037 case PyUnicode_4BYTE_KIND:
2038#if SIZEOF_WCHAR_T == 2
2039 /* This is the only case which has to process surrogates, thus
2040 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02002041 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002042#else
2043 assert(num_surrogates == 0);
Christian Heimesf051e432016-09-13 20:22:02 +02002044 memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002045#endif
2046 break;
2047 default:
2048 assert(0 && "Impossible state");
2049 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002050
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002051 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002052}
2053
Alexander Belopolsky40018472011-02-26 01:02:56 +00002054PyObject *
2055PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002056{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002057 if (size < 0) {
2058 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00002059 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00002060 return NULL;
2061 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002062 if (u != NULL)
2063 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2064 else
2065 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002066}
2067
Alexander Belopolsky40018472011-02-26 01:02:56 +00002068PyObject *
2069PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00002070{
2071 size_t size = strlen(u);
2072 if (size > PY_SSIZE_T_MAX) {
2073 PyErr_SetString(PyExc_OverflowError, "input too long");
2074 return NULL;
2075 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002076 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002077}
2078
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002079PyObject *
2080_PyUnicode_FromId(_Py_Identifier *id)
2081{
2082 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01002083 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
2084 strlen(id->string),
2085 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002086 if (!id->object)
2087 return NULL;
2088 PyUnicode_InternInPlace(&id->object);
2089 assert(!id->next);
2090 id->next = static_strings;
2091 static_strings = id;
2092 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002093 return id->object;
2094}
2095
2096void
2097_PyUnicode_ClearStaticStrings()
2098{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002099 _Py_Identifier *tmp, *s = static_strings;
2100 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02002101 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002102 tmp = s->next;
2103 s->next = NULL;
2104 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002105 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002106 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002107}
2108
Benjamin Peterson0df54292012-03-26 14:50:32 -04002109/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002110
Victor Stinnerd3f08822012-05-29 12:57:52 +02002111PyObject*
2112_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02002113{
Victor Stinnerd3f08822012-05-29 12:57:52 +02002114 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01002115 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01002116 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02002117#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002118 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02002119#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002120 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01002121 }
Victor Stinner785938e2011-12-11 20:09:03 +01002122 unicode = PyUnicode_New(size, 127);
2123 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02002124 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01002125 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2126 assert(_PyUnicode_CheckConsistency(unicode, 1));
2127 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02002128}
2129
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002130static Py_UCS4
2131kind_maxchar_limit(unsigned int kind)
2132{
Benjamin Petersonead6b532011-12-20 17:23:42 -06002133 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002134 case PyUnicode_1BYTE_KIND:
2135 return 0x80;
2136 case PyUnicode_2BYTE_KIND:
2137 return 0x100;
2138 case PyUnicode_4BYTE_KIND:
2139 return 0x10000;
2140 default:
2141 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01002142 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002143 }
2144}
2145
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -07002146static inline Py_UCS4
Victor Stinnere6abb482012-05-02 01:15:40 +02002147align_maxchar(Py_UCS4 maxchar)
2148{
2149 if (maxchar <= 127)
2150 return 127;
2151 else if (maxchar <= 255)
2152 return 255;
2153 else if (maxchar <= 65535)
2154 return 65535;
2155 else
2156 return MAX_UNICODE;
2157}
2158
Victor Stinner702c7342011-10-05 13:50:52 +02002159static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002160_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00002161{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002162 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002163 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002164
Serhiy Storchaka678db842013-01-26 12:16:36 +02002165 if (size == 0)
2166 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002167 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002168 if (size == 1)
2169 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002170
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002171 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002172 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002173 if (!res)
2174 return NULL;
2175 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002176 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002177 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00002178}
2179
Victor Stinnere57b1c02011-09-28 22:20:48 +02002180static PyObject*
2181_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002182{
2183 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002184 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002185
Serhiy Storchaka678db842013-01-26 12:16:36 +02002186 if (size == 0)
2187 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002188 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002189 if (size == 1)
2190 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002191
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002192 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002193 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002194 if (!res)
2195 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002196 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002197 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002198 else {
2199 _PyUnicode_CONVERT_BYTES(
2200 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2201 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002202 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002203 return res;
2204}
2205
Victor Stinnere57b1c02011-09-28 22:20:48 +02002206static PyObject*
2207_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002208{
2209 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002210 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002211
Serhiy Storchaka678db842013-01-26 12:16:36 +02002212 if (size == 0)
2213 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002214 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002215 if (size == 1)
2216 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002217
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002218 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002219 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002220 if (!res)
2221 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002222 if (max_char < 256)
2223 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2224 PyUnicode_1BYTE_DATA(res));
2225 else if (max_char < 0x10000)
2226 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2227 PyUnicode_2BYTE_DATA(res));
2228 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002229 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002230 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002231 return res;
2232}
2233
2234PyObject*
2235PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2236{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002237 if (size < 0) {
2238 PyErr_SetString(PyExc_ValueError, "size must be positive");
2239 return NULL;
2240 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002241 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002242 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002243 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002244 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002245 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002246 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002247 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002248 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002249 PyErr_SetString(PyExc_SystemError, "invalid kind");
2250 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002251 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002252}
2253
Victor Stinnerece58de2012-04-23 23:36:38 +02002254Py_UCS4
2255_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2256{
2257 enum PyUnicode_Kind kind;
2258 void *startptr, *endptr;
2259
2260 assert(PyUnicode_IS_READY(unicode));
2261 assert(0 <= start);
2262 assert(end <= PyUnicode_GET_LENGTH(unicode));
2263 assert(start <= end);
2264
2265 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2266 return PyUnicode_MAX_CHAR_VALUE(unicode);
2267
2268 if (start == end)
2269 return 127;
2270
Victor Stinner94d558b2012-04-27 22:26:58 +02002271 if (PyUnicode_IS_ASCII(unicode))
2272 return 127;
2273
Victor Stinnerece58de2012-04-23 23:36:38 +02002274 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002275 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002276 endptr = (char *)startptr + end * kind;
2277 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002278 switch(kind) {
2279 case PyUnicode_1BYTE_KIND:
2280 return ucs1lib_find_max_char(startptr, endptr);
2281 case PyUnicode_2BYTE_KIND:
2282 return ucs2lib_find_max_char(startptr, endptr);
2283 case PyUnicode_4BYTE_KIND:
2284 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002285 default:
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002286 assert(0);
2287 return 0;
Victor Stinnerece58de2012-04-23 23:36:38 +02002288 }
2289}
2290
Victor Stinner25a4b292011-10-06 12:31:55 +02002291/* Ensure that a string uses the most efficient storage, if it is not the
2292 case: create a new string with of the right kind. Write NULL into *p_unicode
2293 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002294static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002295unicode_adjust_maxchar(PyObject **p_unicode)
2296{
2297 PyObject *unicode, *copy;
2298 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002299 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002300 unsigned int kind;
2301
2302 assert(p_unicode != NULL);
2303 unicode = *p_unicode;
2304 assert(PyUnicode_IS_READY(unicode));
2305 if (PyUnicode_IS_ASCII(unicode))
2306 return;
2307
2308 len = PyUnicode_GET_LENGTH(unicode);
2309 kind = PyUnicode_KIND(unicode);
2310 if (kind == PyUnicode_1BYTE_KIND) {
2311 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002312 max_char = ucs1lib_find_max_char(u, u + len);
2313 if (max_char >= 128)
2314 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002315 }
2316 else if (kind == PyUnicode_2BYTE_KIND) {
2317 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002318 max_char = ucs2lib_find_max_char(u, u + len);
2319 if (max_char >= 256)
2320 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002321 }
2322 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002323 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002324 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002325 max_char = ucs4lib_find_max_char(u, u + len);
2326 if (max_char >= 0x10000)
2327 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002328 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002329 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002330 if (copy != NULL)
2331 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002332 Py_DECREF(unicode);
2333 *p_unicode = copy;
2334}
2335
Victor Stinner034f6cf2011-09-30 02:26:44 +02002336PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002337_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002338{
Victor Stinner87af4f22011-11-21 23:03:47 +01002339 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002340 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002341
Victor Stinner034f6cf2011-09-30 02:26:44 +02002342 if (!PyUnicode_Check(unicode)) {
2343 PyErr_BadInternalCall();
2344 return NULL;
2345 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002346 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002347 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002348
Victor Stinner87af4f22011-11-21 23:03:47 +01002349 length = PyUnicode_GET_LENGTH(unicode);
2350 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002351 if (!copy)
2352 return NULL;
2353 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2354
Christian Heimesf051e432016-09-13 20:22:02 +02002355 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
Victor Stinner87af4f22011-11-21 23:03:47 +01002356 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002357 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002358 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002359}
2360
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002361
Victor Stinnerbc603d12011-10-02 01:00:40 +02002362/* Widen Unicode objects to larger buffers. Don't write terminating null
2363 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002364
2365void*
2366_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2367{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002368 Py_ssize_t len;
2369 void *result;
2370 unsigned int skind;
2371
Benjamin Petersonbac79492012-01-14 13:34:47 -05002372 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002373 return NULL;
2374
2375 len = PyUnicode_GET_LENGTH(s);
2376 skind = PyUnicode_KIND(s);
2377 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002378 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002379 return NULL;
2380 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002381 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002382 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002383 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002384 if (!result)
2385 return PyErr_NoMemory();
2386 assert(skind == PyUnicode_1BYTE_KIND);
2387 _PyUnicode_CONVERT_BYTES(
2388 Py_UCS1, Py_UCS2,
2389 PyUnicode_1BYTE_DATA(s),
2390 PyUnicode_1BYTE_DATA(s) + len,
2391 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002392 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002393 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002394 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002395 if (!result)
2396 return PyErr_NoMemory();
2397 if (skind == PyUnicode_2BYTE_KIND) {
2398 _PyUnicode_CONVERT_BYTES(
2399 Py_UCS2, Py_UCS4,
2400 PyUnicode_2BYTE_DATA(s),
2401 PyUnicode_2BYTE_DATA(s) + len,
2402 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002403 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002404 else {
2405 assert(skind == PyUnicode_1BYTE_KIND);
2406 _PyUnicode_CONVERT_BYTES(
2407 Py_UCS1, Py_UCS4,
2408 PyUnicode_1BYTE_DATA(s),
2409 PyUnicode_1BYTE_DATA(s) + len,
2410 result);
2411 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002412 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002413 default:
2414 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002415 }
Victor Stinner01698042011-10-04 00:04:26 +02002416 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002417 return NULL;
2418}
2419
2420static Py_UCS4*
2421as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2422 int copy_null)
2423{
2424 int kind;
2425 void *data;
2426 Py_ssize_t len, targetlen;
2427 if (PyUnicode_READY(string) == -1)
2428 return NULL;
2429 kind = PyUnicode_KIND(string);
2430 data = PyUnicode_DATA(string);
2431 len = PyUnicode_GET_LENGTH(string);
2432 targetlen = len;
2433 if (copy_null)
2434 targetlen++;
2435 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002436 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002437 if (!target) {
2438 PyErr_NoMemory();
2439 return NULL;
2440 }
2441 }
2442 else {
2443 if (targetsize < targetlen) {
2444 PyErr_Format(PyExc_SystemError,
2445 "string is longer than the buffer");
2446 if (copy_null && 0 < targetsize)
2447 target[0] = 0;
2448 return NULL;
2449 }
2450 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002451 if (kind == PyUnicode_1BYTE_KIND) {
2452 Py_UCS1 *start = (Py_UCS1 *) data;
2453 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002454 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002455 else if (kind == PyUnicode_2BYTE_KIND) {
2456 Py_UCS2 *start = (Py_UCS2 *) data;
2457 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2458 }
2459 else {
2460 assert(kind == PyUnicode_4BYTE_KIND);
Christian Heimesf051e432016-09-13 20:22:02 +02002461 memcpy(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002462 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002463 if (copy_null)
2464 target[len] = 0;
2465 return target;
2466}
2467
2468Py_UCS4*
2469PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2470 int copy_null)
2471{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002472 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002473 PyErr_BadInternalCall();
2474 return NULL;
2475 }
2476 return as_ucs4(string, target, targetsize, copy_null);
2477}
2478
2479Py_UCS4*
2480PyUnicode_AsUCS4Copy(PyObject *string)
2481{
2482 return as_ucs4(string, NULL, 0, 1);
2483}
2484
2485#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00002486
Alexander Belopolsky40018472011-02-26 01:02:56 +00002487PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02002488PyUnicode_FromWideChar(const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002489{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002490 if (w == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002491 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02002492 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson29060642009-01-31 22:14:21 +00002493 PyErr_BadInternalCall();
2494 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002495 }
2496
Martin v. Löwis790465f2008-04-05 20:41:37 +00002497 if (size == -1) {
2498 size = wcslen(w);
2499 }
2500
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002501 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002502}
2503
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002504#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002505
Victor Stinner15a11362012-10-06 23:48:20 +02002506/* maximum number of characters required for output of %lld or %p.
Victor Stinnere215d962012-10-06 23:03:36 +02002507 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
2508 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
2509#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002510
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002511static int
2512unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2513 Py_ssize_t width, Py_ssize_t precision)
2514{
2515 Py_ssize_t length, fill, arglen;
2516 Py_UCS4 maxchar;
2517
2518 if (PyUnicode_READY(str) == -1)
2519 return -1;
2520
2521 length = PyUnicode_GET_LENGTH(str);
2522 if ((precision == -1 || precision >= length)
2523 && width <= length)
2524 return _PyUnicodeWriter_WriteStr(writer, str);
2525
2526 if (precision != -1)
2527 length = Py_MIN(precision, length);
2528
2529 arglen = Py_MAX(length, width);
2530 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2531 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2532 else
2533 maxchar = writer->maxchar;
2534
2535 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2536 return -1;
2537
2538 if (width > length) {
2539 fill = width - length;
2540 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2541 return -1;
2542 writer->pos += fill;
2543 }
2544
2545 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2546 str, 0, length);
2547 writer->pos += length;
2548 return 0;
2549}
2550
2551static int
2552unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2553 Py_ssize_t width, Py_ssize_t precision)
2554{
2555 /* UTF-8 */
2556 Py_ssize_t length;
2557 PyObject *unicode;
2558 int res;
2559
2560 length = strlen(str);
2561 if (precision != -1)
2562 length = Py_MIN(length, precision);
2563 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2564 if (unicode == NULL)
2565 return -1;
2566
2567 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2568 Py_DECREF(unicode);
2569 return res;
2570}
2571
Victor Stinner96865452011-03-01 23:44:09 +00002572static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002573unicode_fromformat_arg(_PyUnicodeWriter *writer,
2574 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002575{
Victor Stinnere215d962012-10-06 23:03:36 +02002576 const char *p;
2577 Py_ssize_t len;
2578 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002579 Py_ssize_t width;
2580 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002581 int longflag;
2582 int longlongflag;
2583 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002584 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002585
2586 p = f;
2587 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002588 zeropad = 0;
2589 if (*f == '0') {
2590 zeropad = 1;
2591 f++;
2592 }
Victor Stinner96865452011-03-01 23:44:09 +00002593
2594 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002595 width = -1;
2596 if (Py_ISDIGIT((unsigned)*f)) {
2597 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002598 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002599 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002600 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002601 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002602 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002603 return NULL;
2604 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002605 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002606 f++;
2607 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002608 }
2609 precision = -1;
2610 if (*f == '.') {
2611 f++;
2612 if (Py_ISDIGIT((unsigned)*f)) {
2613 precision = (*f - '0');
2614 f++;
2615 while (Py_ISDIGIT((unsigned)*f)) {
2616 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2617 PyErr_SetString(PyExc_ValueError,
2618 "precision too big");
2619 return NULL;
2620 }
2621 precision = (precision * 10) + (*f - '0');
2622 f++;
2623 }
2624 }
Victor Stinner96865452011-03-01 23:44:09 +00002625 if (*f == '%') {
2626 /* "%.3%s" => f points to "3" */
2627 f--;
2628 }
2629 }
2630 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002631 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002632 f--;
2633 }
Victor Stinner96865452011-03-01 23:44:09 +00002634
2635 /* Handle %ld, %lu, %lld and %llu. */
2636 longflag = 0;
2637 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002638 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002639 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002640 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002641 longflag = 1;
2642 ++f;
2643 }
Victor Stinner96865452011-03-01 23:44:09 +00002644 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002645 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002646 longlongflag = 1;
2647 f += 2;
2648 }
Victor Stinner96865452011-03-01 23:44:09 +00002649 }
2650 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002651 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002652 size_tflag = 1;
2653 ++f;
2654 }
Victor Stinnere215d962012-10-06 23:03:36 +02002655
2656 if (f[1] == '\0')
2657 writer->overallocate = 0;
2658
2659 switch (*f) {
2660 case 'c':
2661 {
2662 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002663 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002664 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002665 "character argument not in range(0x110000)");
2666 return NULL;
2667 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002668 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002669 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002670 break;
2671 }
2672
2673 case 'i':
2674 case 'd':
2675 case 'u':
2676 case 'x':
2677 {
2678 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002679 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002680 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002681
2682 if (*f == 'u') {
Victor Stinnere215d962012-10-06 23:03:36 +02002683 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002684 len = sprintf(buffer, "%lu",
Victor Stinnere215d962012-10-06 23:03:36 +02002685 va_arg(*vargs, unsigned long));
Victor Stinnere215d962012-10-06 23:03:36 +02002686 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002687 len = sprintf(buffer, "%llu",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002688 va_arg(*vargs, unsigned long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002689 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002690 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002691 va_arg(*vargs, size_t));
2692 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002693 len = sprintf(buffer, "%u",
Victor Stinnere215d962012-10-06 23:03:36 +02002694 va_arg(*vargs, unsigned int));
2695 }
2696 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002697 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002698 }
2699 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002700 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002701 len = sprintf(buffer, "%li",
Victor Stinnere215d962012-10-06 23:03:36 +02002702 va_arg(*vargs, long));
Victor Stinnere215d962012-10-06 23:03:36 +02002703 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002704 len = sprintf(buffer, "%lli",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002705 va_arg(*vargs, long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002706 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002707 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002708 va_arg(*vargs, Py_ssize_t));
2709 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002710 len = sprintf(buffer, "%i",
Victor Stinnere215d962012-10-06 23:03:36 +02002711 va_arg(*vargs, int));
2712 }
2713 assert(len >= 0);
2714
Victor Stinnere215d962012-10-06 23:03:36 +02002715 if (precision < len)
2716 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002717
2718 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002719 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2720 return NULL;
2721
Victor Stinnere215d962012-10-06 23:03:36 +02002722 if (width > precision) {
2723 Py_UCS4 fillchar;
2724 fill = width - precision;
2725 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002726 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2727 return NULL;
2728 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002729 }
Victor Stinner15a11362012-10-06 23:48:20 +02002730 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002731 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002732 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2733 return NULL;
2734 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002735 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002736
Victor Stinner4a587072013-11-19 12:54:53 +01002737 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2738 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002739 break;
2740 }
2741
2742 case 'p':
2743 {
2744 char number[MAX_LONG_LONG_CHARS];
2745
2746 len = sprintf(number, "%p", va_arg(*vargs, void*));
2747 assert(len >= 0);
2748
2749 /* %p is ill-defined: ensure leading 0x. */
2750 if (number[1] == 'X')
2751 number[1] = 'x';
2752 else if (number[1] != 'x') {
2753 memmove(number + 2, number,
2754 strlen(number) + 1);
2755 number[0] = '0';
2756 number[1] = 'x';
2757 len += 2;
2758 }
2759
Victor Stinner4a587072013-11-19 12:54:53 +01002760 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002761 return NULL;
2762 break;
2763 }
2764
2765 case 's':
2766 {
2767 /* UTF-8 */
2768 const char *s = va_arg(*vargs, const char*);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002769 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002770 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002771 break;
2772 }
2773
2774 case 'U':
2775 {
2776 PyObject *obj = va_arg(*vargs, PyObject *);
2777 assert(obj && _PyUnicode_CHECK(obj));
2778
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002779 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002780 return NULL;
2781 break;
2782 }
2783
2784 case 'V':
2785 {
2786 PyObject *obj = va_arg(*vargs, PyObject *);
2787 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002788 if (obj) {
2789 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002790 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002791 return NULL;
2792 }
2793 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002794 assert(str != NULL);
2795 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002796 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002797 }
2798 break;
2799 }
2800
2801 case 'S':
2802 {
2803 PyObject *obj = va_arg(*vargs, PyObject *);
2804 PyObject *str;
2805 assert(obj);
2806 str = PyObject_Str(obj);
2807 if (!str)
2808 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002809 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002810 Py_DECREF(str);
2811 return NULL;
2812 }
2813 Py_DECREF(str);
2814 break;
2815 }
2816
2817 case 'R':
2818 {
2819 PyObject *obj = va_arg(*vargs, PyObject *);
2820 PyObject *repr;
2821 assert(obj);
2822 repr = PyObject_Repr(obj);
2823 if (!repr)
2824 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002825 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002826 Py_DECREF(repr);
2827 return NULL;
2828 }
2829 Py_DECREF(repr);
2830 break;
2831 }
2832
2833 case 'A':
2834 {
2835 PyObject *obj = va_arg(*vargs, PyObject *);
2836 PyObject *ascii;
2837 assert(obj);
2838 ascii = PyObject_ASCII(obj);
2839 if (!ascii)
2840 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002841 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002842 Py_DECREF(ascii);
2843 return NULL;
2844 }
2845 Py_DECREF(ascii);
2846 break;
2847 }
2848
2849 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002850 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002851 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002852 break;
2853
2854 default:
2855 /* if we stumble upon an unknown formatting code, copy the rest
2856 of the format string to the output string. (we cannot just
2857 skip the code, since there's no way to know what's in the
2858 argument list) */
2859 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01002860 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002861 return NULL;
2862 f = p+len;
2863 return f;
2864 }
2865
2866 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002867 return f;
2868}
2869
Walter Dörwaldd2034312007-05-18 16:29:38 +00002870PyObject *
2871PyUnicode_FromFormatV(const char *format, va_list vargs)
2872{
Victor Stinnere215d962012-10-06 23:03:36 +02002873 va_list vargs2;
2874 const char *f;
2875 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002876
Victor Stinner8f674cc2013-04-17 23:02:17 +02002877 _PyUnicodeWriter_Init(&writer);
2878 writer.min_length = strlen(format) + 100;
2879 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02002880
Benjamin Peterson0c212142016-09-20 20:39:33 -07002881 // Copy varags to be able to pass a reference to a subfunction.
2882 va_copy(vargs2, vargs);
Victor Stinnere215d962012-10-06 23:03:36 +02002883
2884 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002885 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002886 f = unicode_fromformat_arg(&writer, f, &vargs2);
2887 if (f == NULL)
2888 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002889 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002890 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002891 const char *p;
2892 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002893
Victor Stinnere215d962012-10-06 23:03:36 +02002894 p = f;
2895 do
2896 {
2897 if ((unsigned char)*p > 127) {
2898 PyErr_Format(PyExc_ValueError,
2899 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2900 "string, got a non-ASCII byte: 0x%02x",
2901 (unsigned char)*p);
Victor Stinner1ddf53d2016-09-21 14:13:14 +02002902 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002903 }
2904 p++;
2905 }
2906 while (*p != '\0' && *p != '%');
2907 len = p - f;
2908
2909 if (*p == '\0')
2910 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01002911
2912 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002913 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002914
2915 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002916 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002917 }
Christian Heimes2f2fee12016-09-21 11:37:27 +02002918 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02002919 return _PyUnicodeWriter_Finish(&writer);
2920
2921 fail:
Christian Heimes2f2fee12016-09-21 11:37:27 +02002922 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02002923 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002924 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002925}
2926
Walter Dörwaldd2034312007-05-18 16:29:38 +00002927PyObject *
2928PyUnicode_FromFormat(const char *format, ...)
2929{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002930 PyObject* ret;
2931 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002932
2933#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002934 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002935#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002936 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002937#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002938 ret = PyUnicode_FromFormatV(format, vargs);
2939 va_end(vargs);
2940 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002941}
2942
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002943#ifdef HAVE_WCHAR_H
2944
Victor Stinner5593d8a2010-10-02 11:11:27 +00002945/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2946 convert a Unicode object to a wide character string.
2947
Victor Stinnerd88d9832011-09-06 02:00:05 +02002948 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002949 character) required to convert the unicode object. Ignore size argument.
2950
Victor Stinnerd88d9832011-09-06 02:00:05 +02002951 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002952 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002953 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002954static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002955unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002956 wchar_t *w,
2957 Py_ssize_t size)
2958{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002959 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002960 const wchar_t *wstr;
2961
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002962 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002963 if (wstr == NULL)
2964 return -1;
2965
Victor Stinner5593d8a2010-10-02 11:11:27 +00002966 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002967 if (size > res)
2968 size = res + 1;
2969 else
2970 res = size;
Christian Heimesf051e432016-09-13 20:22:02 +02002971 memcpy(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002972 return res;
2973 }
2974 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002975 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002976}
2977
2978Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002979PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002980 wchar_t *w,
2981 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002982{
2983 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002984 PyErr_BadInternalCall();
2985 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002986 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002987 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002988}
2989
Victor Stinner137c34c2010-09-29 10:25:54 +00002990wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002991PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002992 Py_ssize_t *size)
2993{
2994 wchar_t* buffer;
2995 Py_ssize_t buflen;
2996
2997 if (unicode == NULL) {
2998 PyErr_BadInternalCall();
2999 return NULL;
3000 }
3001
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003002 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003003 if (buflen == -1)
3004 return NULL;
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02003005 buffer = PyMem_NEW(wchar_t, buflen);
Victor Stinner137c34c2010-09-29 10:25:54 +00003006 if (buffer == NULL) {
3007 PyErr_NoMemory();
3008 return NULL;
3009 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003010 buflen = unicode_aswidechar(unicode, buffer, buflen);
Stefan Krah8528c312012-08-19 21:52:43 +02003011 if (buflen == -1) {
3012 PyMem_FREE(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003013 return NULL;
Stefan Krah8528c312012-08-19 21:52:43 +02003014 }
Victor Stinner5593d8a2010-10-02 11:11:27 +00003015 if (size != NULL)
3016 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00003017 return buffer;
3018}
3019
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003020#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003021
Alexander Belopolsky40018472011-02-26 01:02:56 +00003022PyObject *
3023PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003024{
Victor Stinner8faf8212011-12-08 22:14:11 +01003025 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003026 PyErr_SetString(PyExc_ValueError,
3027 "chr() arg not in range(0x110000)");
3028 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003029 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00003030
Victor Stinner985a82a2014-01-03 12:53:47 +01003031 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003032}
3033
Alexander Belopolsky40018472011-02-26 01:02:56 +00003034PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003035PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003036{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003037 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00003038 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003039 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003040 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02003041 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00003042 Py_INCREF(obj);
3043 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003044 }
3045 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003046 /* For a Unicode subtype that's not a Unicode object,
3047 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003048 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003049 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003050 PyErr_Format(PyExc_TypeError,
3051 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00003052 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003053 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003054}
3055
Alexander Belopolsky40018472011-02-26 01:02:56 +00003056PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003057PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003058 const char *encoding,
3059 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003060{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003061 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003062 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003063
Guido van Rossumd57fd912000-03-10 22:53:23 +00003064 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003065 PyErr_BadInternalCall();
3066 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003067 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003068
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003069 /* Decoding bytes objects is the most common case and should be fast */
3070 if (PyBytes_Check(obj)) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003071 if (PyBytes_GET_SIZE(obj) == 0)
3072 _Py_RETURN_UNICODE_EMPTY();
3073 v = PyUnicode_Decode(
3074 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3075 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003076 return v;
3077 }
3078
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003079 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003080 PyErr_SetString(PyExc_TypeError,
3081 "decoding str is not supported");
3082 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003083 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003084
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003085 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3086 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3087 PyErr_Format(PyExc_TypeError,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003088 "decoding to str: need a bytes-like object, %.80s found",
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003089 Py_TYPE(obj)->tp_name);
3090 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003091 }
Tim Petersced69f82003-09-16 20:30:58 +00003092
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003093 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003094 PyBuffer_Release(&buffer);
3095 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003096 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003097
Serhiy Storchaka05997252013-01-26 12:14:02 +02003098 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003099 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003100 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003101}
3102
/* Normalize an encoding name: similar to encodings.normalize_encoding(), but
   also convert to lowercase.

   Alphanumeric characters and '.' are copied (through Py_TOLOWER()); every
   run of other characters found between two copied characters becomes a
   single '_', while leading and trailing runs are dropped.

   Return 1 on success, or 0 on error (the normalized name does not fit in
   lower_len-1 characters). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *src;
    char *dst;
    char *limit;
    int pending_sep;

    assert(encoding != NULL);

    dst = lower;
    /* Last usable slot: reserved for the terminating null character. */
    limit = &lower[lower_len - 1];
    pending_sep = 0;

    for (src = encoding; *src != 0; src++) {
        char c = *src;

        if (!(Py_ISALNUM(c) || c == '.')) {
            /* Punctuation: remember it, emit at most one '_' later. */
            pending_sep = 1;
            continue;
        }

        if (pending_sep && dst != lower) {
            if (dst == limit) {
                return 0;
            }
            *dst++ = '_';
        }
        pending_sep = 0;

        if (dst == limit) {
            return 0;
        }
        *dst++ = Py_TOLOWER(c);
    }

    *dst = '\0';
    return 1;
}
3151
Alexander Belopolsky40018472011-02-26 01:02:56 +00003152PyObject *
3153PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003154 Py_ssize_t size,
3155 const char *encoding,
3156 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003157{
3158 PyObject *buffer = NULL, *unicode;
3159 Py_buffer info;
Victor Stinner942889a2016-09-05 15:40:10 -07003160 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3161
3162 if (encoding == NULL) {
3163 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3164 }
Victor Stinner600d3be2010-06-10 12:00:55 +00003165
Fred Drakee4315f52000-05-09 19:53:39 +00003166 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003167 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3168 char *lower = buflower;
3169
3170 /* Fast paths */
3171 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3172 lower += 3;
3173 if (*lower == '_') {
3174 /* Match "utf8" and "utf_8" */
3175 lower++;
3176 }
3177
3178 if (lower[0] == '8' && lower[1] == 0) {
3179 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3180 }
3181 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3182 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3183 }
3184 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3185 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3186 }
3187 }
3188 else {
3189 if (strcmp(lower, "ascii") == 0
3190 || strcmp(lower, "us_ascii") == 0) {
3191 return PyUnicode_DecodeASCII(s, size, errors);
3192 }
Steve Dowercc16be82016-09-08 10:35:16 -07003193 #ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003194 else if (strcmp(lower, "mbcs") == 0) {
3195 return PyUnicode_DecodeMBCS(s, size, errors);
3196 }
3197 #endif
3198 else if (strcmp(lower, "latin1") == 0
3199 || strcmp(lower, "latin_1") == 0
3200 || strcmp(lower, "iso_8859_1") == 0
3201 || strcmp(lower, "iso8859_1") == 0) {
3202 return PyUnicode_DecodeLatin1(s, size, errors);
3203 }
3204 }
Victor Stinner37296e82010-06-10 13:36:23 +00003205 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003206
3207 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003208 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003209 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003210 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003211 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003212 if (buffer == NULL)
3213 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003214 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003215 if (unicode == NULL)
3216 goto onError;
3217 if (!PyUnicode_Check(unicode)) {
3218 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003219 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3220 "use codecs.decode() to decode to arbitrary types",
3221 encoding,
Benjamin Peterson8d761ff2016-10-16 15:41:46 -07003222 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003223 Py_DECREF(unicode);
3224 goto onError;
3225 }
3226 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003227 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003228
Benjamin Peterson29060642009-01-31 22:14:21 +00003229 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003230 Py_XDECREF(buffer);
3231 return NULL;
3232}
3233
Alexander Belopolsky40018472011-02-26 01:02:56 +00003234PyObject *
3235PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003236 const char *encoding,
3237 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003238{
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003239 if (!PyUnicode_Check(unicode)) {
3240 PyErr_BadArgument();
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003241 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003242 }
3243
3244 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003245 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003246
3247 /* Decode via the codec registry */
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003248 return PyCodec_Decode(unicode, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003249}
3250
Alexander Belopolsky40018472011-02-26 01:02:56 +00003251PyObject *
3252PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003253 const char *encoding,
3254 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003255{
3256 PyObject *v;
3257
3258 if (!PyUnicode_Check(unicode)) {
3259 PyErr_BadArgument();
3260 goto onError;
3261 }
3262
3263 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003264 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003265
3266 /* Decode via the codec registry */
3267 v = PyCodec_Decode(unicode, encoding, errors);
3268 if (v == NULL)
3269 goto onError;
3270 if (!PyUnicode_Check(v)) {
3271 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003272 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3273 "use codecs.decode() to decode to arbitrary types",
3274 encoding,
Benjamin Peterson8d761ff2016-10-16 15:41:46 -07003275 Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003276 Py_DECREF(v);
3277 goto onError;
3278 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003279 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003280
Benjamin Peterson29060642009-01-31 22:14:21 +00003281 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003282 return NULL;
3283}
3284
Alexander Belopolsky40018472011-02-26 01:02:56 +00003285PyObject *
3286PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003287 Py_ssize_t size,
3288 const char *encoding,
3289 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003290{
3291 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003292
Guido van Rossumd57fd912000-03-10 22:53:23 +00003293 unicode = PyUnicode_FromUnicode(s, size);
3294 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003295 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003296 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3297 Py_DECREF(unicode);
3298 return v;
3299}
3300
Alexander Belopolsky40018472011-02-26 01:02:56 +00003301PyObject *
3302PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003303 const char *encoding,
3304 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003305{
3306 PyObject *v;
3307
3308 if (!PyUnicode_Check(unicode)) {
3309 PyErr_BadArgument();
3310 goto onError;
3311 }
3312
3313 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003314 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003315
3316 /* Encode via the codec registry */
3317 v = PyCodec_Encode(unicode, encoding, errors);
3318 if (v == NULL)
3319 goto onError;
3320 return v;
3321
Benjamin Peterson29060642009-01-31 22:14:21 +00003322 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003323 return NULL;
3324}
3325
/* Locate the first wide character of wstr that wcstombs() refuses to
   convert, by converting the string one character at a time (one surrogate
   pair at a time when wchar_t is 16 bits wide).

   Returns the index of that character in wstr, or 0 when every character
   converts successfully. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
    /* One character (or one surrogate pair) followed by a terminator. */
#if SIZEOF_WCHAR_T == 2
    wchar_t chunk[3];
#else
    wchar_t chunk[2];
#endif
    char encoded[MB_LEN_MAX];
    const wchar_t *cursor = wstr;

#if SIZEOF_WCHAR_T == 2
    chunk[2] = 0;
#else
    chunk[1] = 0;
#endif

    while (*cursor != L'\0') {
        const wchar_t *current = cursor;

#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cursor[1]))
        {
            /* Keep both halves of the pair together. */
            chunk[0] = cursor[0];
            chunk[1] = cursor[1];
            cursor += 2;
        }
        else {
            chunk[0] = *cursor;
            chunk[1] = 0;
            cursor++;
        }
#else
        chunk[0] = *cursor;
        cursor++;
#endif
        if (wcstombs(encoded, chunk, sizeof(encoded)) == (size_t)-1)
            return current - wstr;
    }

    /* failed to find the unencodable character */
    return 0;
}
3372
Victor Stinner1b579672011-12-17 05:47:23 +01003373static int
3374locale_error_handler(const char *errors, int *surrogateescape)
3375{
Victor Stinner50149202015-09-22 00:26:54 +02003376 _Py_error_handler error_handler = get_error_handler(errors);
3377 switch (error_handler)
3378 {
3379 case _Py_ERROR_STRICT:
Victor Stinner1b579672011-12-17 05:47:23 +01003380 *surrogateescape = 0;
3381 return 0;
Victor Stinner50149202015-09-22 00:26:54 +02003382 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner1b579672011-12-17 05:47:23 +01003383 *surrogateescape = 1;
3384 return 0;
Victor Stinner50149202015-09-22 00:26:54 +02003385 default:
3386 PyErr_Format(PyExc_ValueError,
3387 "only 'strict' and 'surrogateescape' error handlers "
3388 "are supported, not '%s'",
3389 errors);
3390 return -1;
Victor Stinner1b579672011-12-17 05:47:23 +01003391 }
Victor Stinner1b579672011-12-17 05:47:23 +01003392}
3393
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003394PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003395PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003396{
3397 Py_ssize_t wlen, wlen2;
3398 wchar_t *wstr;
3399 PyObject *bytes = NULL;
3400 char *errmsg;
Raymond Hettingere56666d2013-08-04 11:51:03 -07003401 PyObject *reason = NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003402 PyObject *exc;
3403 size_t error_pos;
Victor Stinner1b579672011-12-17 05:47:23 +01003404 int surrogateescape;
3405
3406 if (locale_error_handler(errors, &surrogateescape) < 0)
3407 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003408
3409 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3410 if (wstr == NULL)
3411 return NULL;
3412
3413 wlen2 = wcslen(wstr);
3414 if (wlen2 != wlen) {
3415 PyMem_Free(wstr);
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003416 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003417 return NULL;
3418 }
3419
3420 if (surrogateescape) {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003421 /* "surrogateescape" error handler */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003422 char *str;
3423
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003424 str = Py_EncodeLocale(wstr, &error_pos);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003425 if (str == NULL) {
3426 if (error_pos == (size_t)-1) {
3427 PyErr_NoMemory();
3428 PyMem_Free(wstr);
3429 return NULL;
3430 }
3431 else {
3432 goto encode_error;
3433 }
3434 }
3435 PyMem_Free(wstr);
3436
3437 bytes = PyBytes_FromString(str);
3438 PyMem_Free(str);
3439 }
3440 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003441 /* strict mode */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003442 size_t len, len2;
3443
3444 len = wcstombs(NULL, wstr, 0);
3445 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003446 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003447 goto encode_error;
3448 }
3449
3450 bytes = PyBytes_FromStringAndSize(NULL, len);
3451 if (bytes == NULL) {
3452 PyMem_Free(wstr);
3453 return NULL;
3454 }
3455
3456 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3457 if (len2 == (size_t)-1 || len2 > len) {
Victor Stinner2f197072011-12-17 07:08:30 +01003458 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003459 goto encode_error;
3460 }
3461 PyMem_Free(wstr);
3462 }
3463 return bytes;
3464
3465encode_error:
3466 errmsg = strerror(errno);
3467 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003468
3469 if (error_pos == (size_t)-1)
3470 error_pos = wcstombs_errorpos(wstr);
3471
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003472 PyMem_Free(wstr);
3473 Py_XDECREF(bytes);
3474
Victor Stinner2f197072011-12-17 07:08:30 +01003475 if (errmsg != NULL) {
3476 size_t errlen;
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003477 wstr = Py_DecodeLocale(errmsg, &errlen);
Victor Stinner2f197072011-12-17 07:08:30 +01003478 if (wstr != NULL) {
3479 reason = PyUnicode_FromWideChar(wstr, errlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003480 PyMem_RawFree(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003481 } else
3482 errmsg = NULL;
3483 }
3484 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003485 reason = PyUnicode_FromString(
3486 "wcstombs() encountered an unencodable "
3487 "wide character");
3488 if (reason == NULL)
3489 return NULL;
3490
3491 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3492 "locale", unicode,
3493 (Py_ssize_t)error_pos,
3494 (Py_ssize_t)(error_pos+1),
3495 reason);
3496 Py_DECREF(reason);
3497 if (exc != NULL) {
3498 PyCodec_StrictErrors(exc);
3499 Py_XDECREF(exc);
3500 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003501 return NULL;
3502}
3503
Victor Stinnerad158722010-10-27 00:25:46 +00003504PyObject *
3505PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003506{
Steve Dowercc16be82016-09-08 10:35:16 -07003507#if defined(__APPLE__)
3508 return _PyUnicode_AsUTF8String(unicode, Py_FileSystemDefaultEncodeErrors);
Victor Stinnerad158722010-10-27 00:25:46 +00003509#else
Victor Stinner793b5312011-04-27 00:24:21 +02003510 PyInterpreterState *interp = PyThreadState_GET()->interp;
3511 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3512 cannot use it to encode and decode filenames before it is loaded. Load
3513 the Python codec requires to encode at least its own filename. Use the C
3514 version of the locale codec until the codec registry is initialized and
3515 the Python codec is loaded.
3516
3517 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3518 cannot only rely on it: check also interp->fscodec_initialized for
3519 subinterpreters. */
3520 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003521 return PyUnicode_AsEncodedString(unicode,
3522 Py_FileSystemDefaultEncoding,
Steve Dowercc16be82016-09-08 10:35:16 -07003523 Py_FileSystemDefaultEncodeErrors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003524 }
3525 else {
Steve Dowercc16be82016-09-08 10:35:16 -07003526 return PyUnicode_EncodeLocale(unicode, Py_FileSystemDefaultEncodeErrors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003527 }
Victor Stinnerad158722010-10-27 00:25:46 +00003528#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003529}
3530
Alexander Belopolsky40018472011-02-26 01:02:56 +00003531PyObject *
3532PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003533 const char *encoding,
3534 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003535{
3536 PyObject *v;
Victor Stinner942889a2016-09-05 15:40:10 -07003537 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003538
Guido van Rossumd57fd912000-03-10 22:53:23 +00003539 if (!PyUnicode_Check(unicode)) {
3540 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003541 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003542 }
Fred Drakee4315f52000-05-09 19:53:39 +00003543
Victor Stinner942889a2016-09-05 15:40:10 -07003544 if (encoding == NULL) {
3545 return _PyUnicode_AsUTF8String(unicode, errors);
3546 }
3547
Fred Drakee4315f52000-05-09 19:53:39 +00003548 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003549 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3550 char *lower = buflower;
3551
3552 /* Fast paths */
3553 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3554 lower += 3;
3555 if (*lower == '_') {
3556 /* Match "utf8" and "utf_8" */
3557 lower++;
3558 }
3559
3560 if (lower[0] == '8' && lower[1] == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003561 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner942889a2016-09-05 15:40:10 -07003562 }
3563 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3564 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3565 }
3566 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3567 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3568 }
Victor Stinnera5c68c32011-03-02 01:03:14 +00003569 }
Victor Stinner942889a2016-09-05 15:40:10 -07003570 else {
3571 if (strcmp(lower, "ascii") == 0
3572 || strcmp(lower, "us_ascii") == 0) {
3573 return _PyUnicode_AsASCIIString(unicode, errors);
3574 }
Steve Dowercc16be82016-09-08 10:35:16 -07003575#ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003576 else if (strcmp(lower, "mbcs") == 0) {
3577 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3578 }
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003579#endif
Victor Stinner942889a2016-09-05 15:40:10 -07003580 else if (strcmp(lower, "latin1") == 0 ||
3581 strcmp(lower, "latin_1") == 0 ||
3582 strcmp(lower, "iso_8859_1") == 0 ||
3583 strcmp(lower, "iso8859_1") == 0) {
3584 return _PyUnicode_AsLatin1String(unicode, errors);
3585 }
3586 }
Victor Stinner37296e82010-06-10 13:36:23 +00003587 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003588
3589 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003590 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003591 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003592 return NULL;
3593
3594 /* The normal path */
3595 if (PyBytes_Check(v))
3596 return v;
3597
3598 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003599 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003600 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003601 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003602
3603 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003604 "encoder %s returned bytearray instead of bytes; "
3605 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003606 encoding);
3607 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003608 Py_DECREF(v);
3609 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003610 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003611
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003612 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3613 Py_DECREF(v);
3614 return b;
3615 }
3616
3617 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003618 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
3619 "use codecs.encode() to encode to arbitrary types",
3620 encoding,
Benjamin Peterson8d761ff2016-10-16 15:41:46 -07003621 Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003622 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003623 return NULL;
3624}
3625
Alexander Belopolsky40018472011-02-26 01:02:56 +00003626PyObject *
3627PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003628 const char *encoding,
3629 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003630{
3631 PyObject *v;
3632
3633 if (!PyUnicode_Check(unicode)) {
3634 PyErr_BadArgument();
3635 goto onError;
3636 }
3637
3638 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003639 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003640
3641 /* Encode via the codec registry */
3642 v = PyCodec_Encode(unicode, encoding, errors);
3643 if (v == NULL)
3644 goto onError;
3645 if (!PyUnicode_Check(v)) {
3646 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003647 "'%.400s' encoder returned '%.400s' instead of 'str'; "
3648 "use codecs.encode() to encode to arbitrary types",
3649 encoding,
Benjamin Peterson8d761ff2016-10-16 15:41:46 -07003650 Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003651 Py_DECREF(v);
3652 goto onError;
3653 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003654 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003655
Benjamin Peterson29060642009-01-31 22:14:21 +00003656 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003657 return NULL;
3658}
3659
/* Locate the first byte sequence in str[0..len) that mbrtowc() cannot
   convert.

   Returns the byte offset of that sequence.  Returns 0 when no such sequence
   is found, when the end of the string is reached first, or when the build
   does not define HAVE_MBRTOWC. */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *cursor = str;
    size_t remaining = len;
    mbstate_t state;
    wchar_t wc;

    memset(&state, 0, sizeof state);
    while (remaining != 0) {
        size_t consumed = mbrtowc(&wc, cursor, remaining, &state);

        if (consumed == (size_t)-1 || consumed == (size_t)-2) {
            /* Conversion error or incomplete character */
            return (size_t)(cursor - str);
        }
        if (consumed == 0) {
            /* Reached end of string */
            break;
        }
        cursor += consumed;
        remaining -= consumed;
    }
#endif
    /* failed to find the undecodable byte sequence */
    return 0;
}
3690
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003691PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003692PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003693 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003694{
3695 wchar_t smallbuf[256];
3696 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3697 wchar_t *wstr;
3698 size_t wlen, wlen2;
3699 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003700 int surrogateescape;
Victor Stinner2f197072011-12-17 07:08:30 +01003701 size_t error_pos;
3702 char *errmsg;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01003703 PyObject *reason = NULL; /* initialize to prevent gcc warning */
3704 PyObject *exc;
Victor Stinner1b579672011-12-17 05:47:23 +01003705
3706 if (locale_error_handler(errors, &surrogateescape) < 0)
3707 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003708
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003709 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3710 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003711 return NULL;
3712 }
3713
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003714 if (surrogateescape) {
3715 /* "surrogateescape" error handler */
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003716 wstr = Py_DecodeLocale(str, &wlen);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003717 if (wstr == NULL) {
3718 if (wlen == (size_t)-1)
3719 PyErr_NoMemory();
3720 else
3721 PyErr_SetFromErrno(PyExc_OSError);
3722 return NULL;
3723 }
3724
3725 unicode = PyUnicode_FromWideChar(wstr, wlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003726 PyMem_RawFree(wstr);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003727 }
3728 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003729 /* strict mode */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003730#ifndef HAVE_BROKEN_MBSTOWCS
3731 wlen = mbstowcs(NULL, str, 0);
3732#else
3733 wlen = len;
3734#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003735 if (wlen == (size_t)-1)
3736 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003737 if (wlen+1 <= smallbuf_len) {
3738 wstr = smallbuf;
3739 }
3740 else {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02003741 wstr = PyMem_New(wchar_t, wlen+1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003742 if (!wstr)
3743 return PyErr_NoMemory();
3744 }
3745
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003746 wlen2 = mbstowcs(wstr, str, wlen+1);
3747 if (wlen2 == (size_t)-1) {
3748 if (wstr != smallbuf)
3749 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003750 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003751 }
3752#ifdef HAVE_BROKEN_MBSTOWCS
3753 assert(wlen2 == wlen);
3754#endif
3755 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3756 if (wstr != smallbuf)
3757 PyMem_Free(wstr);
3758 }
3759 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003760
3761decode_error:
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +02003762 reason = NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01003763 errmsg = strerror(errno);
3764 assert(errmsg != NULL);
3765
3766 error_pos = mbstowcs_errorpos(str, len);
3767 if (errmsg != NULL) {
3768 size_t errlen;
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003769 wstr = Py_DecodeLocale(errmsg, &errlen);
Victor Stinner2f197072011-12-17 07:08:30 +01003770 if (wstr != NULL) {
3771 reason = PyUnicode_FromWideChar(wstr, errlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003772 PyMem_RawFree(wstr);
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +02003773 }
Victor Stinner2f197072011-12-17 07:08:30 +01003774 }
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +02003775 if (reason == NULL)
Victor Stinner2f197072011-12-17 07:08:30 +01003776 reason = PyUnicode_FromString(
3777 "mbstowcs() encountered an invalid multibyte sequence");
3778 if (reason == NULL)
3779 return NULL;
3780
3781 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3782 "locale", str, len,
3783 (Py_ssize_t)error_pos,
3784 (Py_ssize_t)(error_pos+1),
3785 reason);
3786 Py_DECREF(reason);
3787 if (exc != NULL) {
3788 PyCodec_StrictErrors(exc);
3789 Py_XDECREF(exc);
3790 }
3791 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003792}
3793
3794PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003795PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003796{
3797 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003798 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003799}
3800
3801
3802PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003803PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003804 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003805 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3806}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003807
Christian Heimes5894ba72007-11-04 11:43:14 +00003808PyObject*
3809PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3810{
Steve Dowercc16be82016-09-08 10:35:16 -07003811#if defined(__APPLE__)
3812 return PyUnicode_DecodeUTF8Stateful(s, size, Py_FileSystemDefaultEncodeErrors, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003813#else
Victor Stinner793b5312011-04-27 00:24:21 +02003814 PyInterpreterState *interp = PyThreadState_GET()->interp;
3815 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3816 cannot use it to encode and decode filenames before it is loaded. Load
3817 the Python codec requires to encode at least its own filename. Use the C
3818 version of the locale codec until the codec registry is initialized and
3819 the Python codec is loaded.
3820
3821 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3822 cannot only rely on it: check also interp->fscodec_initialized for
3823 subinterpreters. */
3824 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Steve Dowercc16be82016-09-08 10:35:16 -07003825 PyObject *res = PyUnicode_Decode(s, size,
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003826 Py_FileSystemDefaultEncoding,
Steve Dowercc16be82016-09-08 10:35:16 -07003827 Py_FileSystemDefaultEncodeErrors);
3828#ifdef MS_WINDOWS
3829 if (!res && PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
Serhiy Storchaka467ab192016-10-21 17:09:17 +03003830 _PyErr_FormatFromCause(PyExc_RuntimeError,
3831 "filesystem path bytes were not correctly encoded with '%s'. "
Steve Dowercc16be82016-09-08 10:35:16 -07003832 "Please report this at http://bugs.python.org/issue27781",
3833 Py_FileSystemDefaultEncoding);
Steve Dowercc16be82016-09-08 10:35:16 -07003834 }
3835#endif
3836 return res;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003837 }
3838 else {
Steve Dowercc16be82016-09-08 10:35:16 -07003839 return PyUnicode_DecodeLocaleAndSize(s, size, Py_FileSystemDefaultEncodeErrors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003840 }
Victor Stinnerad158722010-10-27 00:25:46 +00003841#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003842}
3843
Martin v. Löwis011e8422009-05-05 04:43:17 +00003844
3845int
3846PyUnicode_FSConverter(PyObject* arg, void* addr)
3847{
Brett Cannonec6ce872016-09-06 15:50:29 -07003848 PyObject *path = NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003849 PyObject *output = NULL;
3850 Py_ssize_t size;
3851 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003852 if (arg == NULL) {
3853 Py_DECREF(*(PyObject**)addr);
Benjamin Petersona4d33b32015-11-15 21:57:39 -08003854 *(PyObject**)addr = NULL;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003855 return 1;
3856 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003857 path = PyOS_FSPath(arg);
3858 if (path == NULL) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003859 return 0;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003860 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003861 if (PyBytes_Check(path)) {
3862 output = path;
3863 }
3864 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
3865 output = PyUnicode_EncodeFSDefault(path);
3866 Py_DECREF(path);
3867 if (!output) {
3868 return 0;
3869 }
3870 assert(PyBytes_Check(output));
3871 }
3872
Victor Stinner0ea2a462010-04-30 00:22:08 +00003873 size = PyBytes_GET_SIZE(output);
3874 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02003875 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003876 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003877 Py_DECREF(output);
3878 return 0;
3879 }
3880 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003881 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003882}
3883
3884
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003885int
3886PyUnicode_FSDecoder(PyObject* arg, void* addr)
3887{
Brett Cannona5711202016-09-06 19:36:01 -07003888 int is_buffer = 0;
3889 PyObject *path = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003890 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003891 if (arg == NULL) {
3892 Py_DECREF(*(PyObject**)addr);
3893 return 1;
3894 }
Brett Cannona5711202016-09-06 19:36:01 -07003895
3896 is_buffer = PyObject_CheckBuffer(arg);
3897 if (!is_buffer) {
3898 path = PyOS_FSPath(arg);
3899 if (path == NULL) {
Serhiy Storchakafebc3322016-08-06 23:29:29 +03003900 return 0;
3901 }
Brett Cannona5711202016-09-06 19:36:01 -07003902 }
3903 else {
3904 path = arg;
3905 Py_INCREF(arg);
3906 }
3907
3908 if (PyUnicode_Check(path)) {
3909 if (PyUnicode_READY(path) == -1) {
3910 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003911 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07003912 }
3913 output = path;
3914 }
3915 else if (PyBytes_Check(path) || is_buffer) {
3916 PyObject *path_bytes = NULL;
3917
3918 if (!PyBytes_Check(path) &&
3919 PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
3920 "path should be string, bytes, or os.PathLike, not %.200s",
3921 Py_TYPE(arg)->tp_name)) {
3922 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003923 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07003924 }
3925 path_bytes = PyBytes_FromObject(path);
3926 Py_DECREF(path);
3927 if (!path_bytes) {
3928 return 0;
3929 }
3930 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
3931 PyBytes_GET_SIZE(path_bytes));
3932 Py_DECREF(path_bytes);
3933 if (!output) {
3934 return 0;
3935 }
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003936 }
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003937 else {
3938 PyErr_Format(PyExc_TypeError,
Brett Cannona5711202016-09-06 19:36:01 -07003939 "path should be string, bytes, or os.PathLike, not %.200s",
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003940 Py_TYPE(arg)->tp_name);
Brett Cannona5711202016-09-06 19:36:01 -07003941 Py_DECREF(path);
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003942 return 0;
3943 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003944 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003945 Py_DECREF(output);
3946 return 0;
3947 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003948 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003949 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003950 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003951 Py_DECREF(output);
3952 return 0;
3953 }
3954 *(PyObject**)addr = output;
3955 return Py_CLEANUP_SUPPORTED;
3956}
3957
3958
Martin v. Löwis5b222132007-06-10 09:51:05 +00003959char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003960PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003961{
Christian Heimesf3863112007-11-22 07:46:41 +00003962 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003963
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003964 if (!PyUnicode_Check(unicode)) {
3965 PyErr_BadArgument();
3966 return NULL;
3967 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003968 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003969 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003970
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003971 if (PyUnicode_UTF8(unicode) == NULL) {
3972 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003973 bytes = _PyUnicode_AsUTF8String(unicode, NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003974 if (bytes == NULL)
3975 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003976 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3977 if (_PyUnicode_UTF8(unicode) == NULL) {
Victor Stinnera5afb582013-10-29 01:28:23 +01003978 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003979 Py_DECREF(bytes);
3980 return NULL;
3981 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003982 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
Christian Heimesf051e432016-09-13 20:22:02 +02003983 memcpy(_PyUnicode_UTF8(unicode),
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003984 PyBytes_AS_STRING(bytes),
3985 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003986 Py_DECREF(bytes);
3987 }
3988
3989 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003990 *psize = PyUnicode_UTF8_LENGTH(unicode);
3991 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003992}
3993
3994char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003995PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003996{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003997 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3998}
3999
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004000Py_UNICODE *
4001PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
4002{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004003 const unsigned char *one_byte;
4004#if SIZEOF_WCHAR_T == 4
4005 const Py_UCS2 *two_bytes;
4006#else
4007 const Py_UCS4 *four_bytes;
4008 const Py_UCS4 *ucs4_end;
4009 Py_ssize_t num_surrogates;
4010#endif
4011 wchar_t *w;
4012 wchar_t *wchar_end;
4013
4014 if (!PyUnicode_Check(unicode)) {
4015 PyErr_BadArgument();
4016 return NULL;
4017 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004018 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004019 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004020 assert(_PyUnicode_KIND(unicode) != 0);
4021 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004022
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004023 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004024#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004025 four_bytes = PyUnicode_4BYTE_DATA(unicode);
4026 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004027 num_surrogates = 0;
4028
4029 for (; four_bytes < ucs4_end; ++four_bytes) {
4030 if (*four_bytes > 0xFFFF)
4031 ++num_surrogates;
4032 }
4033
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004034 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
4035 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
4036 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004037 PyErr_NoMemory();
4038 return NULL;
4039 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004040 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004041
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004042 w = _PyUnicode_WSTR(unicode);
4043 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
4044 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004045 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
4046 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004047 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004048 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01004049 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
4050 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004051 }
4052 else
4053 *w = *four_bytes;
4054
4055 if (w > wchar_end) {
4056 assert(0 && "Miscalculated string end");
4057 }
4058 }
4059 *w = 0;
4060#else
4061 /* sizeof(wchar_t) == 4 */
4062 Py_FatalError("Impossible unicode object state, wstr and str "
4063 "should share memory already.");
4064 return NULL;
4065#endif
4066 }
4067 else {
Serhiy Storchakae55181f2015-02-20 21:34:06 +02004068 if ((size_t)_PyUnicode_LENGTH(unicode) >
4069 PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
4070 PyErr_NoMemory();
4071 return NULL;
4072 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004073 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
4074 (_PyUnicode_LENGTH(unicode) + 1));
4075 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004076 PyErr_NoMemory();
4077 return NULL;
4078 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004079 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
4080 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
4081 w = _PyUnicode_WSTR(unicode);
4082 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004083
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004084 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
4085 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004086 for (; w < wchar_end; ++one_byte, ++w)
4087 *w = *one_byte;
4088 /* null-terminate the wstr */
4089 *w = 0;
4090 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004091 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004092#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004093 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004094 for (; w < wchar_end; ++two_bytes, ++w)
4095 *w = *two_bytes;
4096 /* null-terminate the wstr */
4097 *w = 0;
4098#else
4099 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004100 PyObject_FREE(_PyUnicode_WSTR(unicode));
4101 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004102 Py_FatalError("Impossible unicode object state, wstr "
4103 "and str should share memory already.");
4104 return NULL;
4105#endif
4106 }
4107 else {
4108 assert(0 && "This should never happen.");
4109 }
4110 }
4111 }
4112 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004113 *size = PyUnicode_WSTR_LENGTH(unicode);
4114 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00004115}
4116
Alexander Belopolsky40018472011-02-26 01:02:56 +00004117Py_UNICODE *
4118PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004119{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004120 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004121}
4122
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004123
Alexander Belopolsky40018472011-02-26 01:02:56 +00004124Py_ssize_t
4125PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004126{
4127 if (!PyUnicode_Check(unicode)) {
4128 PyErr_BadArgument();
4129 goto onError;
4130 }
4131 return PyUnicode_GET_SIZE(unicode);
4132
Benjamin Peterson29060642009-01-31 22:14:21 +00004133 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004134 return -1;
4135}
4136
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004137Py_ssize_t
4138PyUnicode_GetLength(PyObject *unicode)
4139{
Victor Stinner07621332012-06-16 04:53:46 +02004140 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004141 PyErr_BadArgument();
4142 return -1;
4143 }
Victor Stinner07621332012-06-16 04:53:46 +02004144 if (PyUnicode_READY(unicode) == -1)
4145 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004146 return PyUnicode_GET_LENGTH(unicode);
4147}
4148
4149Py_UCS4
4150PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4151{
Victor Stinner69ed0f42013-04-09 21:48:24 +02004152 void *data;
4153 int kind;
4154
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004155 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
4156 PyErr_BadArgument();
4157 return (Py_UCS4)-1;
4158 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004159 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004160 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004161 return (Py_UCS4)-1;
4162 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02004163 data = PyUnicode_DATA(unicode);
4164 kind = PyUnicode_KIND(unicode);
4165 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004166}
4167
4168int
4169PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4170{
4171 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004172 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004173 return -1;
4174 }
Victor Stinner488fa492011-12-12 00:01:39 +01004175 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004176 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004177 PyErr_SetString(PyExc_IndexError, "string index out of range");
4178 return -1;
4179 }
Victor Stinner488fa492011-12-12 00:01:39 +01004180 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004181 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004182 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4183 PyErr_SetString(PyExc_ValueError, "character out of range");
4184 return -1;
4185 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004186 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4187 index, ch);
4188 return 0;
4189}
4190
/* Return the name of the default encoding, which is always "utf-8".  The
   returned string has static storage and must not be freed. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4196
Victor Stinner554f3f02010-06-16 23:33:54 +00004197/* create or adjust a UnicodeDecodeError */
4198static void
4199make_decode_exception(PyObject **exceptionObject,
4200 const char *encoding,
4201 const char *input, Py_ssize_t length,
4202 Py_ssize_t startpos, Py_ssize_t endpos,
4203 const char *reason)
4204{
4205 if (*exceptionObject == NULL) {
4206 *exceptionObject = PyUnicodeDecodeError_Create(
4207 encoding, input, length, startpos, endpos, reason);
4208 }
4209 else {
4210 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4211 goto onError;
4212 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4213 goto onError;
4214 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4215 goto onError;
4216 }
4217 return;
4218
4219onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004220 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004221}
4222
#ifdef MS_WINDOWS
/* Error handling callback helper for decoders whose output is a wchar_t
   based (PyUnicode_WCHAR_KIND) string.

   Steps:
     - look up the error handler named by `errors` (cached in *errorHandler);
     - create or update the UnicodeDecodeError in *exceptionObject;
     - call the handler and check that it returned a (str, int) tuple;
     - re-read *input / *inend from the exception's bytes object, since the
       callback may have replaced it;
     - copy the replacement string to *output at *outpos (growing the
       output if required) and update *outpos, *endinpos and *inptr.

   Return 0 on success, -1 on error (with an exception set).
*/

static int
unicode_decode_call_errorhandler_wchar(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    PyObject **output, Py_ssize_t *outpos)
{
    /* The text after ';' doubles as the TypeError message used below. */
    static const char *argparse = "O!n;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t outsize;
    Py_ssize_t insize;
    Py_ssize_t requiredsize;
    Py_ssize_t newpos;
    PyObject *inputobj = NULL;
    wchar_t *repwstr;
    Py_ssize_t repwlen;

    assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
    outsize = _PyUnicode_WSTR_LENGTH(*output);

    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        PyErr_SetString(PyExc_TypeError, &argparse[4]);
        goto onError;
    }
    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    if (!PyBytes_Check(inputobj)) {
        PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
        /* Bail out: the PyBytes_* accessors below must never be applied to
           a non-bytes object, and we must not report success with an
           exception pending. */
        Py_DECREF(inputobj);
        goto onError;
    }
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    /* a negative position is relative to the end of the input */
    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
    if (repwstr == NULL)
        goto onError;
    /* need more space? (at least enough for what we
       have+the replacement+the rest of the string (starting
       at the new input position), so we won't have to check space
       when there are no errors in the rest of the string) */
    requiredsize = *outpos;
    if (requiredsize > PY_SSIZE_T_MAX - repwlen)
        goto overflow;
    requiredsize += repwlen;
    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
        goto overflow;
    requiredsize += insize - newpos;
    if (requiredsize > outsize) {
        /* grow at least geometrically, when that cannot overflow */
        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
            requiredsize = 2*outsize;
        if (unicode_resize(output, requiredsize) < 0)
            goto onError;
    }
    /* Copy exactly repwlen wide characters.  memcpy() is used instead of
       wcsncpy() because wcsncpy() stops at the first null wide character
       and zero-fills the remainder, which would corrupt a replacement
       string containing embedded NUL characters. */
    memcpy(_PyUnicode_WSTR(*output) + *outpos, repwstr,
           repwlen * sizeof(wchar_t));
    *outpos += repwlen;
    *endinpos = newpos;
    *inptr = *input + newpos;

    /* we made it! */
    Py_XDECREF(restuple);
    return 0;

  overflow:
    PyErr_SetString(PyExc_OverflowError,
                    "decoded result is too long for a Python string");
    /* fall through to the common cleanup */

  onError:
    Py_XDECREF(restuple);
    return -1;
}
#endif   /* MS_WINDOWS */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004338
4339static int
4340unicode_decode_call_errorhandler_writer(
4341 const char *errors, PyObject **errorHandler,
4342 const char *encoding, const char *reason,
4343 const char **input, const char **inend, Py_ssize_t *startinpos,
4344 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4345 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4346{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02004347 static const char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004348
4349 PyObject *restuple = NULL;
4350 PyObject *repunicode = NULL;
4351 Py_ssize_t insize;
4352 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004353 Py_ssize_t replen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004354 PyObject *inputobj = NULL;
4355
4356 if (*errorHandler == NULL) {
4357 *errorHandler = PyCodec_LookupError(errors);
4358 if (*errorHandler == NULL)
4359 goto onError;
4360 }
4361
4362 make_decode_exception(exceptionObject,
4363 encoding,
4364 *input, *inend - *input,
4365 *startinpos, *endinpos,
4366 reason);
4367 if (*exceptionObject == NULL)
4368 goto onError;
4369
4370 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4371 if (restuple == NULL)
4372 goto onError;
4373 if (!PyTuple_Check(restuple)) {
4374 PyErr_SetString(PyExc_TypeError, &argparse[4]);
4375 goto onError;
4376 }
4377 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004378 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004379
4380 /* Copy back the bytes variables, which might have been modified by the
4381 callback */
4382 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4383 if (!inputobj)
4384 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00004385 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004386 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00004387 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004388 *input = PyBytes_AS_STRING(inputobj);
4389 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004390 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004391 /* we can DECREF safely, as the exception has another reference,
4392 so the object won't go away. */
4393 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004394
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004395 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004396 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004397 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004398 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004399 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004400 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004401
Victor Stinner8f674cc2013-04-17 23:02:17 +02004402 if (PyUnicode_READY(repunicode) < 0)
4403 goto onError;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004404 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004405 if (replen > 1) {
4406 writer->min_length += replen - 1;
Victor Stinner8f674cc2013-04-17 23:02:17 +02004407 writer->overallocate = 1;
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004408 if (_PyUnicodeWriter_Prepare(writer, writer->min_length,
4409 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4410 goto onError;
4411 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004412 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004413 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004414
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004415 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004416 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004417
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004418 /* we made it! */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004419 Py_XDECREF(restuple);
4420 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004421
Benjamin Peterson29060642009-01-31 22:14:21 +00004422 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004423 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004424 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004425}
4426
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Three simple macros defining base-64.  Note that each of them may
   evaluate its argument more than once. */

/* IS_BASE64(c): is c one of the base-64 characters
   (A-Z, a-z, 0-9, '+' or '/')? */

#define IS_BASE64(c)                        \
    (((c) >= 'A' && (c) <= 'Z') ||          \
     ((c) >= 'a' && (c) <= 'z') ||          \
     ((c) >= '0' && (c) <= '9') ||          \
     (c) == '+' || (c) == '/')

/* FROM_BASE64(c): the base-64 value (0..63) of c.  Only meaningful when
   c is a base-64 character; anything not matched by the first four tests
   maps to 63. */

#define FROM_BASE64(c)                                      \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :               \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :          \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :          \
     (c) == '+' ? 62 : 63)

/* TO_BASE64(n): the base-64 character for the bottom 6 bits of n. */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT(c): this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                \
    ((c) <= 127 && (c) != '+')
4461
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code (0..127) and is read-only, hence
 * it is declared const.
 */

static const
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c is the code point to test; NUL and anything >= 128 are never direct
 * (the range test also guards the table lookup).  directO / directWS are
 * truth values selecting whether Set O / whitespace characters count as
 * direct.  All arguments are parenthesized in the expansion so that
 * compound expressions can be passed safely; c may be evaluated more
 * than once. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004507
Alexander Belopolsky40018472011-02-26 01:02:56 +00004508PyObject *
4509PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004510 Py_ssize_t size,
4511 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004512{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004513 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4514}
4515
Antoine Pitrou244651a2009-05-04 18:56:13 +00004516/* The decoder. The only state we preserve is our read position,
4517 * i.e. how many characters we have consumed. So if we end in the
4518 * middle of a shift sequence we have to back off the read position
4519 * and the output to the beginning of the sequence, otherwise we lose
4520 * all the shift state (seen bits, number of bits seen, high
4521 * surrogate). */
4522
Alexander Belopolsky40018472011-02-26 01:02:56 +00004523PyObject *
4524PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004525 Py_ssize_t size,
4526 const char *errors,
4527 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004528{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004529 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004530 Py_ssize_t startinpos;
4531 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004532 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004533 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004534 const char *errmsg = "";
4535 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004536 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004537 unsigned int base64bits = 0;
4538 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004539 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004540 PyObject *errorHandler = NULL;
4541 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004542
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004543 if (size == 0) {
4544 if (consumed)
4545 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004546 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004547 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004548
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004549 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004550 _PyUnicodeWriter_Init(&writer);
4551 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004552
4553 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004554 e = s + size;
4555
4556 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004557 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004558 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004559 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004560
Antoine Pitrou244651a2009-05-04 18:56:13 +00004561 if (inShift) { /* in a base-64 section */
4562 if (IS_BASE64(ch)) { /* consume a base-64 character */
4563 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4564 base64bits += 6;
4565 s++;
4566 if (base64bits >= 16) {
4567 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004568 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004569 base64bits -= 16;
4570 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004571 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004572 if (surrogate) {
4573 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004574 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4575 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004576 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004577 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004578 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004579 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004580 }
4581 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004582 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004583 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004584 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004585 }
4586 }
Victor Stinner551ac952011-11-29 22:58:13 +01004587 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004588 /* first surrogate */
4589 surrogate = outCh;
4590 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004591 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004592 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004593 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004594 }
4595 }
4596 }
4597 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004598 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004599 if (base64bits > 0) { /* left-over bits */
4600 if (base64bits >= 6) {
4601 /* We've seen at least one base-64 character */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004602 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004603 errmsg = "partial character in shift sequence";
4604 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004605 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004606 else {
4607 /* Some bits remain; they should be zero */
4608 if (base64buffer != 0) {
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004609 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004610 errmsg = "non-zero padding bits in shift sequence";
4611 goto utf7Error;
4612 }
4613 }
4614 }
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004615 if (surrogate && DECODE_DIRECT(ch)) {
4616 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4617 goto onError;
4618 }
4619 surrogate = 0;
4620 if (ch == '-') {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004621 /* '-' is absorbed; other terminating
4622 characters are preserved */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004623 s++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004624 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004625 }
4626 }
4627 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004628 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004629 s++; /* consume '+' */
4630 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004631 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004632 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004633 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004634 }
4635 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004636 inShift = 1;
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004637 surrogate = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004638 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004639 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004640 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004641 }
4642 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004643 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004644 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004645 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004646 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004647 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004648 else {
4649 startinpos = s-starts;
4650 s++;
4651 errmsg = "unexpected special character";
4652 goto utf7Error;
4653 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004654 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004655utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004656 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004657 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004658 errors, &errorHandler,
4659 "utf7", errmsg,
4660 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004661 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004662 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004663 }
4664
Antoine Pitrou244651a2009-05-04 18:56:13 +00004665 /* end of string */
4666
4667 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4668 /* if we're in an inconsistent state, that's an error */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004669 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004670 if (surrogate ||
4671 (base64bits >= 6) ||
4672 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004673 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004674 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004675 errors, &errorHandler,
4676 "utf7", "unterminated shift sequence",
4677 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004678 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004679 goto onError;
4680 if (s < e)
4681 goto restart;
4682 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004683 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004684
4685 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004686 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004687 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004688 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004689 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004690 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004691 writer.kind, writer.data, shiftOutStart);
4692 Py_XDECREF(errorHandler);
4693 Py_XDECREF(exc);
4694 _PyUnicodeWriter_Dealloc(&writer);
4695 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004696 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004697 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004698 }
4699 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004700 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004701 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004702 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004703
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004704 Py_XDECREF(errorHandler);
4705 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004706 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004707
Benjamin Peterson29060642009-01-31 22:14:21 +00004708 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004709 Py_XDECREF(errorHandler);
4710 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004711 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004712 return NULL;
4713}
4714
4715
Alexander Belopolsky40018472011-02-26 01:02:56 +00004716PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004717_PyUnicode_EncodeUTF7(PyObject *str,
4718 int base64SetO,
4719 int base64WhiteSpace,
4720 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004721{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004722 int kind;
4723 void *data;
4724 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004725 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004726 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004727 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004728 unsigned int base64bits = 0;
4729 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004730 char * out;
4731 char * start;
4732
Benjamin Petersonbac79492012-01-14 13:34:47 -05004733 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004734 return NULL;
4735 kind = PyUnicode_KIND(str);
4736 data = PyUnicode_DATA(str);
4737 len = PyUnicode_GET_LENGTH(str);
4738
4739 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004740 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004741
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004742 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004743 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004744 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004745 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004746 if (v == NULL)
4747 return NULL;
4748
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004749 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004750 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004751 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004752
Antoine Pitrou244651a2009-05-04 18:56:13 +00004753 if (inShift) {
4754 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4755 /* shifting out */
4756 if (base64bits) { /* output remaining bits */
4757 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4758 base64buffer = 0;
4759 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004760 }
4761 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004762 /* Characters not in the BASE64 set implicitly unshift the sequence
4763 so no '-' is required, except if the character is itself a '-' */
4764 if (IS_BASE64(ch) || ch == '-') {
4765 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004766 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004767 *out++ = (char) ch;
4768 }
4769 else {
4770 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004771 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004772 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004773 else { /* not in a shift sequence */
4774 if (ch == '+') {
4775 *out++ = '+';
4776 *out++ = '-';
4777 }
4778 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4779 *out++ = (char) ch;
4780 }
4781 else {
4782 *out++ = '+';
4783 inShift = 1;
4784 goto encode_char;
4785 }
4786 }
4787 continue;
4788encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004789 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004790 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004791
Antoine Pitrou244651a2009-05-04 18:56:13 +00004792 /* code first surrogate */
4793 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004794 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004795 while (base64bits >= 6) {
4796 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4797 base64bits -= 6;
4798 }
4799 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004800 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004801 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004802 base64bits += 16;
4803 base64buffer = (base64buffer << 16) | ch;
4804 while (base64bits >= 6) {
4805 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4806 base64bits -= 6;
4807 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004808 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004809 if (base64bits)
4810 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4811 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004812 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004813 if (_PyBytes_Resize(&v, out - start) < 0)
4814 return NULL;
4815 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004816}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004817PyObject *
4818PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4819 Py_ssize_t size,
4820 int base64SetO,
4821 int base64WhiteSpace,
4822 const char *errors)
4823{
4824 PyObject *result;
4825 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4826 if (tmp == NULL)
4827 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004828 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004829 base64WhiteSpace, errors);
4830 Py_DECREF(tmp);
4831 return result;
4832}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004833
Antoine Pitrou244651a2009-05-04 18:56:13 +00004834#undef IS_BASE64
4835#undef FROM_BASE64
4836#undef TO_BASE64
4837#undef DECODE_DIRECT
4838#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004839
Guido van Rossumd57fd912000-03-10 22:53:23 +00004840/* --- UTF-8 Codec -------------------------------------------------------- */
4841
Alexander Belopolsky40018472011-02-26 01:02:56 +00004842PyObject *
4843PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004844 Py_ssize_t size,
4845 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004846{
Walter Dörwald69652032004-09-07 20:24:22 +00004847 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4848}
4849
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004850#include "stringlib/asciilib.h"
4851#include "stringlib/codecs.h"
4852#include "stringlib/undef.h"
4853
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004854#include "stringlib/ucs1lib.h"
4855#include "stringlib/codecs.h"
4856#include "stringlib/undef.h"
4857
4858#include "stringlib/ucs2lib.h"
4859#include "stringlib/codecs.h"
4860#include "stringlib/undef.h"
4861
4862#include "stringlib/ucs4lib.h"
4863#include "stringlib/codecs.h"
4864#include "stringlib/undef.h"
4865
Antoine Pitrouab868312009-01-10 15:40:25 +00004866/* Mask to quickly check whether a C 'long' contains a
4867 non-ASCII, UTF8-encoded char. */
4868#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004869# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004870#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004871# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004872#else
4873# error C 'long' size should be either 4 or 8!
4874#endif
4875
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004876static Py_ssize_t
4877ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004878{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004879 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004880 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004881
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004882 /*
4883 * Issue #17237: m68k is a bit different from most architectures in
4884 * that objects do not use "natural alignment" - for example, int and
4885 * long are only aligned at 2-byte boundaries. Therefore the assert()
4886 * won't work; also, tests have shown that skipping the "optimised
4887 * version" will even speed up m68k.
4888 */
4889#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004890#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004891 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4892 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004893 /* Fast path, see in STRINGLIB(utf8_decode) for
4894 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004895 /* Help allocation */
4896 const char *_p = p;
4897 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004898 while (_p < aligned_end) {
4899 unsigned long value = *(const unsigned long *) _p;
4900 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004901 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004902 *((unsigned long *)q) = value;
4903 _p += SIZEOF_LONG;
4904 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004905 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004906 p = _p;
4907 while (p < end) {
4908 if ((unsigned char)*p & 0x80)
4909 break;
4910 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004911 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004912 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004913 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004914#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004915#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004916 while (p < end) {
4917 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4918 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004919 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004920 /* Help allocation */
4921 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004922 while (_p < aligned_end) {
4923 unsigned long value = *(unsigned long *) _p;
4924 if (value & ASCII_CHAR_MASK)
4925 break;
4926 _p += SIZEOF_LONG;
4927 }
4928 p = _p;
4929 if (_p == end)
4930 break;
4931 }
4932 if ((unsigned char)*p & 0x80)
4933 break;
4934 ++p;
4935 }
4936 memcpy(dest, start, p - start);
4937 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004938}
Antoine Pitrouab868312009-01-10 15:40:25 +00004939
Victor Stinner785938e2011-12-11 20:09:03 +01004940PyObject *
4941PyUnicode_DecodeUTF8Stateful(const char *s,
4942 Py_ssize_t size,
4943 const char *errors,
4944 Py_ssize_t *consumed)
4945{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004946 _PyUnicodeWriter writer;
Victor Stinner785938e2011-12-11 20:09:03 +01004947 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004948 const char *end = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004949
4950 Py_ssize_t startinpos;
4951 Py_ssize_t endinpos;
4952 const char *errmsg = "";
Victor Stinner1d65d912015-10-05 13:43:50 +02004953 PyObject *error_handler_obj = NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004954 PyObject *exc = NULL;
Victor Stinner1d65d912015-10-05 13:43:50 +02004955 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner785938e2011-12-11 20:09:03 +01004956
4957 if (size == 0) {
4958 if (consumed)
4959 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004960 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004961 }
4962
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004963 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4964 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004965 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004966 *consumed = 1;
4967 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004968 }
4969
Victor Stinner8f674cc2013-04-17 23:02:17 +02004970 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02004971 writer.min_length = size;
4972 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004973 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004974
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004975 writer.pos = ascii_decode(s, end, writer.data);
4976 s += writer.pos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004977 while (s < end) {
4978 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004979 int kind = writer.kind;
Victor Stinner1d65d912015-10-05 13:43:50 +02004980
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004981 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004982 if (PyUnicode_IS_ASCII(writer.buffer))
4983 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004984 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004985 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004986 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004987 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004988 } else {
4989 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004990 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004991 }
4992
4993 switch (ch) {
4994 case 0:
4995 if (s == end || consumed)
4996 goto End;
4997 errmsg = "unexpected end of data";
4998 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004999 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005000 break;
5001 case 1:
5002 errmsg = "invalid start byte";
5003 startinpos = s - starts;
5004 endinpos = startinpos + 1;
5005 break;
5006 case 2:
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005007 case 3:
5008 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005009 errmsg = "invalid continuation byte";
5010 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005011 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005012 break;
5013 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005014 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005015 goto onError;
5016 continue;
5017 }
5018
Victor Stinner1d65d912015-10-05 13:43:50 +02005019 if (error_handler == _Py_ERROR_UNKNOWN)
5020 error_handler = get_error_handler(errors);
5021
5022 switch (error_handler) {
5023 case _Py_ERROR_IGNORE:
5024 s += (endinpos - startinpos);
5025 break;
5026
5027 case _Py_ERROR_REPLACE:
5028 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
5029 goto onError;
5030 s += (endinpos - startinpos);
5031 break;
5032
5033 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner74e8fac2015-10-05 13:49:26 +02005034 {
5035 Py_ssize_t i;
5036
Victor Stinner1d65d912015-10-05 13:43:50 +02005037 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
5038 goto onError;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005039 for (i=startinpos; i<endinpos; i++) {
Victor Stinner1d65d912015-10-05 13:43:50 +02005040 ch = (Py_UCS4)(unsigned char)(starts[i]);
5041 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
5042 ch + 0xdc00);
5043 writer.pos++;
5044 }
5045 s += (endinpos - startinpos);
5046 break;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005047 }
Victor Stinner1d65d912015-10-05 13:43:50 +02005048
5049 default:
5050 if (unicode_decode_call_errorhandler_writer(
5051 errors, &error_handler_obj,
5052 "utf-8", errmsg,
5053 &starts, &end, &startinpos, &endinpos, &exc, &s,
5054 &writer))
5055 goto onError;
5056 }
Victor Stinner785938e2011-12-11 20:09:03 +01005057 }
5058
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005059End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005060 if (consumed)
5061 *consumed = s - starts;
5062
Victor Stinner1d65d912015-10-05 13:43:50 +02005063 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005064 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005065 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005066
5067onError:
Victor Stinner1d65d912015-10-05 13:43:50 +02005068 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005069 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005070 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005071 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01005072}
5073
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005074#ifdef __APPLE__
5075
5076/* Simplified UTF-8 decoder using surrogateescape error handler,
Victor Stinner0d92c4f2012-11-12 23:32:21 +01005077 used to decode the command line arguments on Mac OS X.
5078
5079 Return a pointer to a newly allocated wide character string (use
Victor Stinner6f8eeee2013-07-07 22:57:45 +02005080 PyMem_RawFree() to free the memory), or NULL on memory allocation error. */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005081
5082wchar_t*
5083_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
5084{
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005085 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005086 wchar_t *unicode;
5087 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005088
5089 /* Note: size will always be longer than the resulting Unicode
5090 character count */
Victor Stinnerf50e1872015-03-20 11:32:24 +01005091 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1))
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005092 return NULL;
Victor Stinner6f8eeee2013-07-07 22:57:45 +02005093 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005094 if (!unicode)
5095 return NULL;
5096
5097 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005098 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005099 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005100 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005101 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005102#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005103 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005104#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005105 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005106#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005107 if (ch > 0xFF) {
5108#if SIZEOF_WCHAR_T == 4
5109 assert(0);
5110#else
5111 assert(Py_UNICODE_IS_SURROGATE(ch));
5112 /* compute and append the two surrogates: */
5113 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5114 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5115#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005116 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005117 else {
5118 if (!ch && s == e)
5119 break;
5120 /* surrogateescape */
5121 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5122 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005123 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005124 unicode[outpos] = L'\0';
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005125 return unicode;
5126}
5127
5128#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00005129
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005130/* Primary internal function which creates utf8 encoded bytes objects.
5131
5132 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005133 and allocate exactly as much space needed at the end. Else allocate the
5134 maximum possible needed (4 result bytes per Unicode character), and return
5135 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005136*/
Tim Peters7e3d9612002-04-21 03:26:37 +00005137PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01005138_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005139{
Victor Stinner6099a032011-12-18 14:22:26 +01005140 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005141 void *data;
5142 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00005143
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005144 if (!PyUnicode_Check(unicode)) {
5145 PyErr_BadArgument();
5146 return NULL;
5147 }
5148
5149 if (PyUnicode_READY(unicode) == -1)
5150 return NULL;
5151
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005152 if (PyUnicode_UTF8(unicode))
5153 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5154 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005155
5156 kind = PyUnicode_KIND(unicode);
5157 data = PyUnicode_DATA(unicode);
5158 size = PyUnicode_GET_LENGTH(unicode);
5159
Benjamin Petersonead6b532011-12-20 17:23:42 -06005160 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005161 default:
5162 assert(0);
5163 case PyUnicode_1BYTE_KIND:
5164 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5165 assert(!PyUnicode_IS_ASCII(unicode));
5166 return ucs1lib_utf8_encoder(unicode, data, size, errors);
5167 case PyUnicode_2BYTE_KIND:
5168 return ucs2lib_utf8_encoder(unicode, data, size, errors);
5169 case PyUnicode_4BYTE_KIND:
5170 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00005171 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005172}
5173
Alexander Belopolsky40018472011-02-26 01:02:56 +00005174PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005175PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5176 Py_ssize_t size,
5177 const char *errors)
5178{
5179 PyObject *v, *unicode;
5180
5181 unicode = PyUnicode_FromUnicode(s, size);
5182 if (unicode == NULL)
5183 return NULL;
5184 v = _PyUnicode_AsUTF8String(unicode, errors);
5185 Py_DECREF(unicode);
5186 return v;
5187}
5188
5189PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005190PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005191{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005192 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005193}
5194
Walter Dörwald41980ca2007-08-16 21:55:45 +00005195/* --- UTF-32 Codec ------------------------------------------------------- */
5196
5197PyObject *
5198PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005199 Py_ssize_t size,
5200 const char *errors,
5201 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005202{
5203 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5204}
5205
5206PyObject *
5207PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005208 Py_ssize_t size,
5209 const char *errors,
5210 int *byteorder,
5211 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005212{
5213 const char *starts = s;
5214 Py_ssize_t startinpos;
5215 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005216 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005217 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01005218 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005219 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005220 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005221 PyObject *errorHandler = NULL;
5222 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005223
Walter Dörwald41980ca2007-08-16 21:55:45 +00005224 q = (unsigned char *)s;
5225 e = q + size;
5226
5227 if (byteorder)
5228 bo = *byteorder;
5229
5230 /* Check for BOM marks (U+FEFF) in the input and adjust current
5231 byte order setting accordingly. In native mode, the leading BOM
5232 mark is skipped, in all other modes, it is copied to the output
5233 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01005234 if (bo == 0 && size >= 4) {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005235 Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005236 if (bom == 0x0000FEFF) {
5237 bo = -1;
5238 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005239 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005240 else if (bom == 0xFFFE0000) {
5241 bo = 1;
5242 q += 4;
5243 }
5244 if (byteorder)
5245 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005246 }
5247
Victor Stinnere64322e2012-10-30 23:12:47 +01005248 if (q == e) {
5249 if (consumed)
5250 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005251 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00005252 }
5253
Victor Stinnere64322e2012-10-30 23:12:47 +01005254#ifdef WORDS_BIGENDIAN
5255 le = bo < 0;
5256#else
5257 le = bo <= 0;
5258#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005259 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01005260
Victor Stinner8f674cc2013-04-17 23:02:17 +02005261 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005262 writer.min_length = (e - q + 3) / 4;
5263 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005264 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005265
Victor Stinnere64322e2012-10-30 23:12:47 +01005266 while (1) {
5267 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005268 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005269
Victor Stinnere64322e2012-10-30 23:12:47 +01005270 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005271 enum PyUnicode_Kind kind = writer.kind;
5272 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005273 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005274 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005275 if (le) {
5276 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005277 ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005278 if (ch > maxch)
5279 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005280 if (kind != PyUnicode_1BYTE_KIND &&
5281 Py_UNICODE_IS_SURROGATE(ch))
5282 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005283 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005284 q += 4;
5285 } while (q <= last);
5286 }
5287 else {
5288 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005289 ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
Victor Stinnere64322e2012-10-30 23:12:47 +01005290 if (ch > maxch)
5291 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005292 if (kind != PyUnicode_1BYTE_KIND &&
5293 Py_UNICODE_IS_SURROGATE(ch))
5294 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005295 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005296 q += 4;
5297 } while (q <= last);
5298 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005299 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005300 }
5301
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005302 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005303 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005304 startinpos = ((const char *)q) - starts;
5305 endinpos = startinpos + 4;
5306 }
5307 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005308 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005309 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005310 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005311 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005312 startinpos = ((const char *)q) - starts;
5313 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005314 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005315 else {
5316 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005317 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005318 goto onError;
5319 q += 4;
5320 continue;
5321 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005322 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005323 startinpos = ((const char *)q) - starts;
5324 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005325 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005326
5327 /* The remaining input chars are ignored if the callback
5328 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005329 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005330 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005331 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005332 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005333 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005334 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005335 }
5336
Walter Dörwald41980ca2007-08-16 21:55:45 +00005337 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005338 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005339
Walter Dörwald41980ca2007-08-16 21:55:45 +00005340 Py_XDECREF(errorHandler);
5341 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005342 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005343
Benjamin Peterson29060642009-01-31 22:14:21 +00005344 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005345 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005346 Py_XDECREF(errorHandler);
5347 Py_XDECREF(exc);
5348 return NULL;
5349}
5350
5351PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005352_PyUnicode_EncodeUTF32(PyObject *str,
5353 const char *errors,
5354 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005355{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005356 enum PyUnicode_Kind kind;
5357 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005358 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005359 PyObject *v;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005360 uint32_t *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005361#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005362 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005363#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005364 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005365#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005366 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005367 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005368 PyObject *errorHandler = NULL;
5369 PyObject *exc = NULL;
5370 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005371
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005372 if (!PyUnicode_Check(str)) {
5373 PyErr_BadArgument();
5374 return NULL;
5375 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005376 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005377 return NULL;
5378 kind = PyUnicode_KIND(str);
5379 data = PyUnicode_DATA(str);
5380 len = PyUnicode_GET_LENGTH(str);
5381
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005382 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005383 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005384 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005385 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005386 if (v == NULL)
5387 return NULL;
5388
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005389 /* output buffer is 4-bytes aligned */
5390 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005391 out = (uint32_t *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005392 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005393 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005394 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005395 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005396
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005397 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005398 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005399 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005400 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005401 else
5402 encoding = "utf-32";
5403
5404 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005405 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5406 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005407 }
5408
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005409 pos = 0;
5410 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005411 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005412
5413 if (kind == PyUnicode_2BYTE_KIND) {
5414 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5415 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005416 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005417 else {
5418 assert(kind == PyUnicode_4BYTE_KIND);
5419 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5420 &out, native_ordering);
5421 }
5422 if (pos == len)
5423 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005424
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005425 rep = unicode_encode_call_errorhandler(
5426 errors, &errorHandler,
5427 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005428 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005429 if (!rep)
5430 goto error;
5431
5432 if (PyBytes_Check(rep)) {
5433 repsize = PyBytes_GET_SIZE(rep);
5434 if (repsize & 3) {
5435 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005436 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005437 "surrogates not allowed");
5438 goto error;
5439 }
5440 moreunits = repsize / 4;
5441 }
5442 else {
5443 assert(PyUnicode_Check(rep));
5444 if (PyUnicode_READY(rep) < 0)
5445 goto error;
5446 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5447 if (!PyUnicode_IS_ASCII(rep)) {
5448 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005449 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005450 "surrogates not allowed");
5451 goto error;
5452 }
5453 }
5454
5455 /* four bytes are reserved for each surrogate */
5456 if (moreunits > 1) {
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005457 Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005458 Py_ssize_t morebytes = 4 * (moreunits - 1);
5459 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5460 /* integer overflow */
5461 PyErr_NoMemory();
5462 goto error;
5463 }
5464 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5465 goto error;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005466 out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005467 }
5468
5469 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005470 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005471 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005472 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005473 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005474 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5475 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005476 }
5477
5478 Py_CLEAR(rep);
5479 }
5480
5481 /* Cut back to size actually needed. This is necessary for, for example,
5482 encoding of a string containing isolated surrogates and the 'ignore'
5483 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005484 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005485 if (nsize != PyBytes_GET_SIZE(v))
5486 _PyBytes_Resize(&v, nsize);
5487 Py_XDECREF(errorHandler);
5488 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005489 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005490 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005491 error:
5492 Py_XDECREF(rep);
5493 Py_XDECREF(errorHandler);
5494 Py_XDECREF(exc);
5495 Py_XDECREF(v);
5496 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005497}
5498
Alexander Belopolsky40018472011-02-26 01:02:56 +00005499PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005500PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5501 Py_ssize_t size,
5502 const char *errors,
5503 int byteorder)
5504{
5505 PyObject *result;
5506 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5507 if (tmp == NULL)
5508 return NULL;
5509 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5510 Py_DECREF(tmp);
5511 return result;
5512}
5513
5514PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005515PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005516{
Victor Stinnerb960b342011-11-20 19:12:52 +01005517 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005518}
5519
Guido van Rossumd57fd912000-03-10 22:53:23 +00005520/* --- UTF-16 Codec ------------------------------------------------------- */
5521
Tim Peters772747b2001-08-09 22:21:55 +00005522PyObject *
5523PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005524 Py_ssize_t size,
5525 const char *errors,
5526 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005527{
Walter Dörwald69652032004-09-07 20:24:22 +00005528 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5529}
5530
5531PyObject *
5532PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005533 Py_ssize_t size,
5534 const char *errors,
5535 int *byteorder,
5536 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005537{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005538 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005539 Py_ssize_t startinpos;
5540 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005541 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005542 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005543 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005544 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005545 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005546 PyObject *errorHandler = NULL;
5547 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005548 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005549
Tim Peters772747b2001-08-09 22:21:55 +00005550 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005551 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005552
5553 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005554 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005555
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005556 /* Check for BOM marks (U+FEFF) in the input and adjust current
5557 byte order setting accordingly. In native mode, the leading BOM
5558 mark is skipped, in all other modes, it is copied to the output
5559 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005560 if (bo == 0 && size >= 2) {
5561 const Py_UCS4 bom = (q[1] << 8) | q[0];
5562 if (bom == 0xFEFF) {
5563 q += 2;
5564 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005565 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005566 else if (bom == 0xFFFE) {
5567 q += 2;
5568 bo = 1;
5569 }
5570 if (byteorder)
5571 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005572 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005573
Antoine Pitrou63065d72012-05-15 23:48:04 +02005574 if (q == e) {
5575 if (consumed)
5576 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005577 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005578 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005579
Christian Heimes743e0cd2012-10-17 23:52:17 +02005580#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005581 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005582 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005583#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005584 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005585 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005586#endif
Tim Peters772747b2001-08-09 22:21:55 +00005587
Antoine Pitrou63065d72012-05-15 23:48:04 +02005588 /* Note: size will always be longer than the resulting Unicode
5589 character count */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005590 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005591 writer.min_length = (e - q + 1) / 2;
5592 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005593 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005594
Antoine Pitrou63065d72012-05-15 23:48:04 +02005595 while (1) {
5596 Py_UCS4 ch = 0;
5597 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005598 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005599 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005600 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005601 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005602 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005603 native_ordering);
5604 else
5605 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005606 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005607 native_ordering);
5608 } else if (kind == PyUnicode_2BYTE_KIND) {
5609 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005610 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005611 native_ordering);
5612 } else {
5613 assert(kind == PyUnicode_4BYTE_KIND);
5614 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005615 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005616 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005617 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005618 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005619
Antoine Pitrou63065d72012-05-15 23:48:04 +02005620 switch (ch)
5621 {
5622 case 0:
5623 /* remaining byte at the end? (size should be even) */
5624 if (q == e || consumed)
5625 goto End;
5626 errmsg = "truncated data";
5627 startinpos = ((const char *)q) - starts;
5628 endinpos = ((const char *)e) - starts;
5629 break;
5630 /* The remaining input chars are ignored if the callback
5631 chooses to skip the input */
5632 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005633 q -= 2;
5634 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005635 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005636 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005637 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005638 endinpos = ((const char *)e) - starts;
5639 break;
5640 case 2:
5641 errmsg = "illegal encoding";
5642 startinpos = ((const char *)q) - 2 - starts;
5643 endinpos = startinpos + 2;
5644 break;
5645 case 3:
5646 errmsg = "illegal UTF-16 surrogate";
5647 startinpos = ((const char *)q) - 4 - starts;
5648 endinpos = startinpos + 2;
5649 break;
5650 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005651 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005652 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005653 continue;
5654 }
5655
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005656 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005657 errors,
5658 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005659 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00005660 &starts,
5661 (const char **)&e,
5662 &startinpos,
5663 &endinpos,
5664 &exc,
5665 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005666 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005667 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005668 }
5669
Antoine Pitrou63065d72012-05-15 23:48:04 +02005670End:
Walter Dörwald69652032004-09-07 20:24:22 +00005671 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005672 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005673
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005674 Py_XDECREF(errorHandler);
5675 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005676 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005677
Benjamin Peterson29060642009-01-31 22:14:21 +00005678 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005679 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005680 Py_XDECREF(errorHandler);
5681 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005682 return NULL;
5683}
5684
Tim Peters772747b2001-08-09 22:21:55 +00005685PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005686_PyUnicode_EncodeUTF16(PyObject *str,
5687 const char *errors,
5688 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005689{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005690 enum PyUnicode_Kind kind;
5691 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005692 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005693 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005694 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005695 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005696#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005697 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005698#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005699 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005700#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005701 const char *encoding;
5702 Py_ssize_t nsize, pos;
5703 PyObject *errorHandler = NULL;
5704 PyObject *exc = NULL;
5705 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00005706
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005707 if (!PyUnicode_Check(str)) {
5708 PyErr_BadArgument();
5709 return NULL;
5710 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005711 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005712 return NULL;
5713 kind = PyUnicode_KIND(str);
5714 data = PyUnicode_DATA(str);
5715 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005716
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005717 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005718 if (kind == PyUnicode_4BYTE_KIND) {
5719 const Py_UCS4 *in = (const Py_UCS4 *)data;
5720 const Py_UCS4 *end = in + len;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005721 while (in < end) {
5722 if (*in++ >= 0x10000) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005723 pairs++;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005724 }
5725 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005726 }
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005727 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005728 return PyErr_NoMemory();
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005729 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005730 nsize = len + pairs + (byteorder == 0);
5731 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005732 if (v == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005733 return NULL;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005734 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005735
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005736 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005737 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005738 out = (unsigned short *)PyBytes_AS_STRING(v);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005739 if (byteorder == 0) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005740 *out++ = 0xFEFF;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005741 }
5742 if (len == 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00005743 goto done;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005744 }
Tim Peters772747b2001-08-09 22:21:55 +00005745
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005746 if (kind == PyUnicode_1BYTE_KIND) {
5747 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5748 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005749 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005750
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005751 if (byteorder < 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005752 encoding = "utf-16-le";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005753 }
5754 else if (byteorder > 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005755 encoding = "utf-16-be";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005756 }
5757 else {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005758 encoding = "utf-16";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005759 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005760
5761 pos = 0;
5762 while (pos < len) {
5763 Py_ssize_t repsize, moreunits;
5764
5765 if (kind == PyUnicode_2BYTE_KIND) {
5766 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
5767 &out, native_ordering);
5768 }
5769 else {
5770 assert(kind == PyUnicode_4BYTE_KIND);
5771 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
5772 &out, native_ordering);
5773 }
5774 if (pos == len)
5775 break;
5776
5777 rep = unicode_encode_call_errorhandler(
5778 errors, &errorHandler,
5779 encoding, "surrogates not allowed",
5780 str, &exc, pos, pos + 1, &pos);
5781 if (!rep)
5782 goto error;
5783
5784 if (PyBytes_Check(rep)) {
5785 repsize = PyBytes_GET_SIZE(rep);
5786 if (repsize & 1) {
5787 raise_encode_exception(&exc, encoding,
5788 str, pos - 1, pos,
5789 "surrogates not allowed");
5790 goto error;
5791 }
5792 moreunits = repsize / 2;
5793 }
5794 else {
5795 assert(PyUnicode_Check(rep));
5796 if (PyUnicode_READY(rep) < 0)
5797 goto error;
5798 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5799 if (!PyUnicode_IS_ASCII(rep)) {
5800 raise_encode_exception(&exc, encoding,
5801 str, pos - 1, pos,
5802 "surrogates not allowed");
5803 goto error;
5804 }
5805 }
5806
5807 /* two bytes are reserved for each surrogate */
5808 if (moreunits > 1) {
5809 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
5810 Py_ssize_t morebytes = 2 * (moreunits - 1);
5811 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5812 /* integer overflow */
5813 PyErr_NoMemory();
5814 goto error;
5815 }
5816 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5817 goto error;
5818 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
5819 }
5820
5821 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005822 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005823 out += moreunits;
5824 } else /* rep is unicode */ {
5825 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5826 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5827 &out, native_ordering);
5828 }
5829
5830 Py_CLEAR(rep);
5831 }
5832
5833 /* Cut back to size actually needed. This is necessary for, for example,
5834 encoding of a string containing isolated surrogates and the 'ignore' handler
5835 is used. */
5836 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5837 if (nsize != PyBytes_GET_SIZE(v))
5838 _PyBytes_Resize(&v, nsize);
5839 Py_XDECREF(errorHandler);
5840 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00005841 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005842 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005843 error:
5844 Py_XDECREF(rep);
5845 Py_XDECREF(errorHandler);
5846 Py_XDECREF(exc);
5847 Py_XDECREF(v);
5848 return NULL;
5849#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005850}
5851
Alexander Belopolsky40018472011-02-26 01:02:56 +00005852PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005853PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5854 Py_ssize_t size,
5855 const char *errors,
5856 int byteorder)
5857{
5858 PyObject *result;
5859 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5860 if (tmp == NULL)
5861 return NULL;
5862 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5863 Py_DECREF(tmp);
5864 return result;
5865}
5866
5867PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005868PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005869{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005870 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005871}
5872
5873/* --- Unicode Escape Codec ----------------------------------------------- */
5874
Fredrik Lundh06d12682001-01-24 07:59:11 +00005875static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005876
Alexander Belopolsky40018472011-02-26 01:02:56 +00005877PyObject *
5878PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005879 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005880 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005881{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005882 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005883 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005884 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005885 PyObject *errorHandler = NULL;
5886 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005887
Victor Stinner62ec3312016-09-06 17:04:34 -07005888 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005889 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07005890 }
5891 /* Escaped strings will always be longer than the resulting
5892 Unicode string, so we start with size here and then reduce the
5893 length after conversion to the true value.
5894 (but if the error callback returns a long replacement string
5895 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005896 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07005897 writer.min_length = size;
5898 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
5899 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005900 }
5901
Guido van Rossumd57fd912000-03-10 22:53:23 +00005902 end = s + size;
5903 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07005904 unsigned char c = (unsigned char) *s++;
5905 Py_UCS4 ch;
5906 int count;
5907 Py_ssize_t startinpos;
5908 Py_ssize_t endinpos;
5909 const char *message;
5910
5911#define WRITE_ASCII_CHAR(ch) \
5912 do { \
5913 assert(ch <= 127); \
5914 assert(writer.pos < writer.size); \
5915 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
5916 } while(0)
5917
5918#define WRITE_CHAR(ch) \
5919 do { \
5920 if (ch <= writer.maxchar) { \
5921 assert(writer.pos < writer.size); \
5922 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
5923 } \
5924 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
5925 goto onError; \
5926 } \
5927 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005928
5929 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07005930 if (c != '\\') {
5931 WRITE_CHAR(c);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005932 continue;
5933 }
5934
Victor Stinner62ec3312016-09-06 17:04:34 -07005935 startinpos = s - starts - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005936 /* \ - Escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07005937 if (s >= end) {
5938 message = "\\ at end of string";
5939 goto error;
5940 }
5941 c = (unsigned char) *s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005942
Victor Stinner62ec3312016-09-06 17:04:34 -07005943 assert(writer.pos < writer.size);
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005944 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005945
Benjamin Peterson29060642009-01-31 22:14:21 +00005946 /* \x escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07005947 case '\n': continue;
5948 case '\\': WRITE_ASCII_CHAR('\\'); continue;
5949 case '\'': WRITE_ASCII_CHAR('\''); continue;
5950 case '\"': WRITE_ASCII_CHAR('\"'); continue;
5951 case 'b': WRITE_ASCII_CHAR('\b'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005952 /* FF */
Victor Stinner62ec3312016-09-06 17:04:34 -07005953 case 'f': WRITE_ASCII_CHAR('\014'); continue;
5954 case 't': WRITE_ASCII_CHAR('\t'); continue;
5955 case 'n': WRITE_ASCII_CHAR('\n'); continue;
5956 case 'r': WRITE_ASCII_CHAR('\r'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005957 /* VT */
Victor Stinner62ec3312016-09-06 17:04:34 -07005958 case 'v': WRITE_ASCII_CHAR('\013'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005959 /* BEL, not classic C */
Victor Stinner62ec3312016-09-06 17:04:34 -07005960 case 'a': WRITE_ASCII_CHAR('\007'); continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005961
Benjamin Peterson29060642009-01-31 22:14:21 +00005962 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005963 case '0': case '1': case '2': case '3':
5964 case '4': case '5': case '6': case '7':
Victor Stinner62ec3312016-09-06 17:04:34 -07005965 ch = c - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005966 if (s < end && '0' <= *s && *s <= '7') {
Victor Stinner62ec3312016-09-06 17:04:34 -07005967 ch = (ch<<3) + *s++ - '0';
5968 if (s < end && '0' <= *s && *s <= '7') {
5969 ch = (ch<<3) + *s++ - '0';
5970 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005971 }
Victor Stinner62ec3312016-09-06 17:04:34 -07005972 WRITE_CHAR(ch);
5973 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005974
Benjamin Peterson29060642009-01-31 22:14:21 +00005975 /* hex escapes */
5976 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005977 case 'x':
Victor Stinner62ec3312016-09-06 17:04:34 -07005978 count = 2;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005979 message = "truncated \\xXX escape";
5980 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005981
Benjamin Peterson29060642009-01-31 22:14:21 +00005982 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005983 case 'u':
Victor Stinner62ec3312016-09-06 17:04:34 -07005984 count = 4;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005985 message = "truncated \\uXXXX escape";
5986 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005987
Benjamin Peterson29060642009-01-31 22:14:21 +00005988 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005989 case 'U':
Victor Stinner62ec3312016-09-06 17:04:34 -07005990 count = 8;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005991 message = "truncated \\UXXXXXXXX escape";
5992 hexescape:
Victor Stinner62ec3312016-09-06 17:04:34 -07005993 for (ch = 0; count && s < end; ++s, --count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02005994 c = (unsigned char)*s;
Victor Stinner62ec3312016-09-06 17:04:34 -07005995 ch <<= 4;
5996 if (c >= '0' && c <= '9') {
5997 ch += c - '0';
5998 }
5999 else if (c >= 'a' && c <= 'f') {
6000 ch += c - ('a' - 10);
6001 }
6002 else if (c >= 'A' && c <= 'F') {
6003 ch += c - ('A' - 10);
6004 }
6005 else {
6006 break;
6007 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00006008 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006009 if (count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006010 goto error;
Victor Stinner62ec3312016-09-06 17:04:34 -07006011 }
6012
6013 /* when we get here, ch is a 32-bit unicode character */
6014 if (ch > MAX_UNICODE) {
6015 message = "illegal Unicode character";
6016 goto error;
6017 }
6018
6019 WRITE_CHAR(ch);
6020 continue;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006021
Benjamin Peterson29060642009-01-31 22:14:21 +00006022 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006023 case 'N':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006024 if (ucnhash_CAPI == NULL) {
6025 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006026 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6027 PyUnicodeData_CAPSULE_NAME, 1);
Victor Stinner62ec3312016-09-06 17:04:34 -07006028 if (ucnhash_CAPI == NULL) {
6029 PyErr_SetString(
6030 PyExc_UnicodeError,
6031 "\\N escapes not supported (can't load unicodedata module)"
6032 );
6033 goto onError;
6034 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006035 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006036
6037 message = "malformed \\N character escape";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006038 if (*s == '{') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006039 const char *start = ++s;
6040 size_t namelen;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006041 /* look for the closing brace */
Victor Stinner62ec3312016-09-06 17:04:34 -07006042 while (s < end && *s != '}')
Fredrik Lundhccc74732001-02-18 22:13:49 +00006043 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006044 namelen = s - start;
6045 if (namelen && s < end) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00006046 /* found a name. look it up in the unicode database */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006047 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006048 ch = 0xffffffff; /* in case 'getcode' messes up */
6049 if (namelen <= INT_MAX &&
6050 ucnhash_CAPI->getcode(NULL, start, (int)namelen,
6051 &ch, 0)) {
6052 assert(ch <= MAX_UNICODE);
6053 WRITE_CHAR(ch);
6054 continue;
6055 }
6056 message = "unknown Unicode character name";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006057 }
6058 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006059 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006060
6061 default:
R David Murray110b6fe2016-09-08 15:34:08 -04006062 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6063 "invalid escape sequence '\\%c'", c) < 0)
6064 goto onError;
Victor Stinner62ec3312016-09-06 17:04:34 -07006065 WRITE_ASCII_CHAR('\\');
6066 WRITE_CHAR(c);
6067 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006068 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006069
6070 error:
6071 endinpos = s-starts;
Victor Stinner62ec3312016-09-06 17:04:34 -07006072 writer.min_length = end - s + writer.pos;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006073 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02006074 errors, &errorHandler,
6075 "unicodeescape", message,
6076 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner62ec3312016-09-06 17:04:34 -07006077 &writer)) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006078 goto onError;
Victor Stinner62ec3312016-09-06 17:04:34 -07006079 }
6080 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0) {
6081 goto onError;
6082 }
6083
6084#undef WRITE_ASCII_CHAR
6085#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006086 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006087
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006088 Py_XDECREF(errorHandler);
6089 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006090 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00006091
Benjamin Peterson29060642009-01-31 22:14:21 +00006092 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006093 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006094 Py_XDECREF(errorHandler);
6095 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006096 return NULL;
6097}
6098
/* Return a Unicode-Escape encoded bytes version of the Unicode object.

   The result is not enclosed in quotes.

*/
6105
Alexander Belopolsky40018472011-02-26 01:02:56 +00006106PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006107PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006108{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006109 Py_ssize_t i, len;
Victor Stinner62ec3312016-09-06 17:04:34 -07006110 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006111 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006112 enum PyUnicode_Kind kind;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006113 void *data;
Victor Stinner62ec3312016-09-06 17:04:34 -07006114 Py_ssize_t expandsize;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006115
Ezio Melottie7f90372012-10-05 03:33:31 +03006116 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00006117 escape.
6118
Ezio Melottie7f90372012-10-05 03:33:31 +03006119 For UCS1 strings it's '\xxx', 4 bytes per source character.
6120 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6121 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00006122 */
6123
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006124 if (!PyUnicode_Check(unicode)) {
6125 PyErr_BadArgument();
6126 return NULL;
6127 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006128 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006129 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006130 }
Victor Stinner358af132015-10-12 22:36:57 +02006131
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006132 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006133 if (len == 0) {
6134 return PyBytes_FromStringAndSize(NULL, 0);
6135 }
6136
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006137 kind = PyUnicode_KIND(unicode);
6138 data = PyUnicode_DATA(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006139 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6140 bytes, and 1 byte characters 4. */
6141 expandsize = kind * 2 + 2;
6142 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize) {
6143 return PyErr_NoMemory();
6144 }
6145 repr = PyBytes_FromStringAndSize(NULL, 2 + expandsize * len + 1);
6146 if (repr == NULL) {
6147 return NULL;
6148 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006149
Victor Stinner62ec3312016-09-06 17:04:34 -07006150 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006151 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006152 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006153
Victor Stinner62ec3312016-09-06 17:04:34 -07006154 /* U+0000-U+00ff range */
6155 if (ch < 0x100) {
6156 if (ch >= ' ' && ch < 127) {
6157 if (ch != '\\') {
6158 /* Copy printable US ASCII as-is */
6159 *p++ = (char) ch;
6160 }
6161 /* Escape backslashes */
6162 else {
6163 *p++ = '\\';
6164 *p++ = '\\';
6165 }
6166 }
Victor Stinner358af132015-10-12 22:36:57 +02006167
Victor Stinner62ec3312016-09-06 17:04:34 -07006168 /* Map special whitespace to '\t', \n', '\r' */
6169 else if (ch == '\t') {
6170 *p++ = '\\';
6171 *p++ = 't';
6172 }
6173 else if (ch == '\n') {
6174 *p++ = '\\';
6175 *p++ = 'n';
6176 }
6177 else if (ch == '\r') {
6178 *p++ = '\\';
6179 *p++ = 'r';
6180 }
6181
6182 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6183 else {
6184 *p++ = '\\';
6185 *p++ = 'x';
6186 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6187 *p++ = Py_hexdigits[ch & 0x000F];
6188 }
Tim Petersced69f82003-09-16 20:30:58 +00006189 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006190 /* U+0000-U+00ff range: Map 16-bit characters to '\uHHHH' */
6191 else if (ch < 0x10000) {
6192 /* U+0100-U+ffff */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006193 *p++ = '\\';
6194 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006195 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6196 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6197 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6198 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006199 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006200 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6201 else {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006202
Victor Stinner62ec3312016-09-06 17:04:34 -07006203 /* Make sure that the first two digits are zero */
6204 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006205 *p++ = '\\';
Victor Stinner62ec3312016-09-06 17:04:34 -07006206 *p++ = 'U';
6207 *p++ = '0';
6208 *p++ = '0';
6209 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6210 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6211 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6212 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6213 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6214 *p++ = Py_hexdigits[ch & 0x0000000F];
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006215 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006216 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006217
Victor Stinner62ec3312016-09-06 17:04:34 -07006218 assert(p - PyBytes_AS_STRING(repr) > 0);
6219 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6220 return NULL;
6221 }
6222 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006223}
6224
Alexander Belopolsky40018472011-02-26 01:02:56 +00006225PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006226PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6227 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006228{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006229 PyObject *result;
6230 PyObject *tmp = PyUnicode_FromUnicode(s, size);
Victor Stinner62ec3312016-09-06 17:04:34 -07006231 if (tmp == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006232 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006233 }
6234
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006235 result = PyUnicode_AsUnicodeEscapeString(tmp);
6236 Py_DECREF(tmp);
6237 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006238}
6239
6240/* --- Raw Unicode Escape Codec ------------------------------------------- */
6241
Alexander Belopolsky40018472011-02-26 01:02:56 +00006242PyObject *
6243PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006244 Py_ssize_t size,
6245 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006246{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006247 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006248 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006249 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006250 PyObject *errorHandler = NULL;
6251 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006252
Victor Stinner62ec3312016-09-06 17:04:34 -07006253 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006254 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006255 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006256
Guido van Rossumd57fd912000-03-10 22:53:23 +00006257 /* Escaped strings will always be longer than the resulting
6258 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006259 length after conversion to the true value. (But decoding error
6260 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006261 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07006262 writer.min_length = size;
6263 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6264 goto onError;
6265 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006266
Guido van Rossumd57fd912000-03-10 22:53:23 +00006267 end = s + size;
6268 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006269 unsigned char c = (unsigned char) *s++;
6270 Py_UCS4 ch;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006271 int count;
Victor Stinner62ec3312016-09-06 17:04:34 -07006272 Py_ssize_t startinpos;
6273 Py_ssize_t endinpos;
6274 const char *message;
6275
6276#define WRITE_CHAR(ch) \
6277 do { \
6278 if (ch <= writer.maxchar) { \
6279 assert(writer.pos < writer.size); \
6280 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6281 } \
6282 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6283 goto onError; \
6284 } \
6285 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006286
Benjamin Peterson29060642009-01-31 22:14:21 +00006287 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006288 if (c != '\\' || s >= end) {
6289 WRITE_CHAR(c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006290 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006291 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006292
Victor Stinner62ec3312016-09-06 17:04:34 -07006293 c = (unsigned char) *s++;
6294 if (c == 'u') {
6295 count = 4;
6296 message = "truncated \\uXXXX escape";
Benjamin Peterson29060642009-01-31 22:14:21 +00006297 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006298 else if (c == 'U') {
6299 count = 8;
6300 message = "truncated \\UXXXXXXXX escape";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006301 }
6302 else {
Victor Stinner62ec3312016-09-06 17:04:34 -07006303 assert(writer.pos < writer.size);
6304 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6305 WRITE_CHAR(c);
6306 continue;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006307 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006308 startinpos = s - starts - 2;
6309
6310 /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6311 for (ch = 0; count && s < end; ++s, --count) {
6312 c = (unsigned char)*s;
6313 ch <<= 4;
6314 if (c >= '0' && c <= '9') {
6315 ch += c - '0';
6316 }
6317 else if (c >= 'a' && c <= 'f') {
6318 ch += c - ('a' - 10);
6319 }
6320 else if (c >= 'A' && c <= 'F') {
6321 ch += c - ('A' - 10);
6322 }
6323 else {
6324 break;
6325 }
6326 }
6327 if (!count) {
6328 if (ch <= MAX_UNICODE) {
6329 WRITE_CHAR(ch);
6330 continue;
6331 }
6332 message = "\\Uxxxxxxxx out of range";
6333 }
6334
6335 endinpos = s-starts;
6336 writer.min_length = end - s + writer.pos;
6337 if (unicode_decode_call_errorhandler_writer(
6338 errors, &errorHandler,
6339 "rawunicodeescape", message,
6340 &starts, &end, &startinpos, &endinpos, &exc, &s,
6341 &writer)) {
6342 goto onError;
6343 }
6344 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0) {
6345 goto onError;
6346 }
6347
6348#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006349 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006350 Py_XDECREF(errorHandler);
6351 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006352 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006353
Benjamin Peterson29060642009-01-31 22:14:21 +00006354 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006355 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006356 Py_XDECREF(errorHandler);
6357 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006358 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006359
Guido van Rossumd57fd912000-03-10 22:53:23 +00006360}
6361
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006362
Alexander Belopolsky40018472011-02-26 01:02:56 +00006363PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006364PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006365{
Victor Stinner62ec3312016-09-06 17:04:34 -07006366 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006367 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006368 Py_ssize_t expandsize, pos;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006369 int kind;
6370 void *data;
6371 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006372
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006373 if (!PyUnicode_Check(unicode)) {
6374 PyErr_BadArgument();
6375 return NULL;
6376 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006377 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006378 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006379 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006380 kind = PyUnicode_KIND(unicode);
6381 data = PyUnicode_DATA(unicode);
6382 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006383 if (kind == PyUnicode_1BYTE_KIND) {
6384 return PyBytes_FromStringAndSize(data, len);
6385 }
Victor Stinner0e368262011-11-10 20:12:49 +01006386
Victor Stinner62ec3312016-09-06 17:04:34 -07006387 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6388 bytes, and 1 byte characters 4. */
6389 expandsize = kind * 2 + 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006390
Victor Stinner62ec3312016-09-06 17:04:34 -07006391 if (len > PY_SSIZE_T_MAX / expandsize) {
6392 return PyErr_NoMemory();
6393 }
6394 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6395 if (repr == NULL) {
6396 return NULL;
6397 }
6398 if (len == 0) {
6399 return repr;
6400 }
6401
6402 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006403 for (pos = 0; pos < len; pos++) {
6404 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Victor Stinner358af132015-10-12 22:36:57 +02006405
Victor Stinner62ec3312016-09-06 17:04:34 -07006406 /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6407 if (ch < 0x100) {
6408 *p++ = (char) ch;
Tim Petersced69f82003-09-16 20:30:58 +00006409 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006410 /* U+0000-U+00ff range: Map 16-bit characters to '\uHHHH' */
6411 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006412 *p++ = '\\';
6413 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006414 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6415 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6416 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6417 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006418 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006419 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6420 else {
6421 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6422 *p++ = '\\';
6423 *p++ = 'U';
6424 *p++ = '0';
6425 *p++ = '0';
6426 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6427 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6428 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6429 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6430 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6431 *p++ = Py_hexdigits[ch & 15];
6432 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006433 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006434
Victor Stinner62ec3312016-09-06 17:04:34 -07006435 assert(p > PyBytes_AS_STRING(repr));
6436 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6437 return NULL;
6438 }
6439 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006440}
6441
Alexander Belopolsky40018472011-02-26 01:02:56 +00006442PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006443PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6444 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006445{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006446 PyObject *result;
6447 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6448 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006449 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006450 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6451 Py_DECREF(tmp);
6452 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006453}
6454
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006455/* --- Unicode Internal Codec ------------------------------------------- */
6456
Alexander Belopolsky40018472011-02-26 01:02:56 +00006457PyObject *
6458_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006459 Py_ssize_t size,
6460 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006461{
6462 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006463 Py_ssize_t startinpos;
6464 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006465 _PyUnicodeWriter writer;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006466 const char *end;
6467 const char *reason;
6468 PyObject *errorHandler = NULL;
6469 PyObject *exc = NULL;
6470
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006471 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006472 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006473 1))
6474 return NULL;
6475
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006476 if (size == 0)
6477 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006478
Victor Stinner8f674cc2013-04-17 23:02:17 +02006479 _PyUnicodeWriter_Init(&writer);
6480 if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) {
6481 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00006482 goto onError;
Victor Stinner8f674cc2013-04-17 23:02:17 +02006483 }
6484 writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006485
Victor Stinner8f674cc2013-04-17 23:02:17 +02006486 end = s + size;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006487 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006488 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006489 Py_UCS4 ch;
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006490 if (end - s < Py_UNICODE_SIZE) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006491 endinpos = end-starts;
6492 reason = "truncated input";
6493 goto error;
6494 }
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006495 /* We copy the raw representation one byte at a time because the
6496 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006497 ((char *) &uch)[0] = s[0];
6498 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006499#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006500 ((char *) &uch)[2] = s[2];
6501 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006502#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006503 ch = uch;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006504#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006505 /* We have to sanity check the raw data, otherwise doom looms for
6506 some malformed UCS-4 data. */
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006507 if (ch > 0x10ffff) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006508 endinpos = s - starts + Py_UNICODE_SIZE;
6509 reason = "illegal code point (> 0x10FFFF)";
6510 goto error;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006511 }
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006512#endif
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006513 s += Py_UNICODE_SIZE;
6514#ifndef Py_UNICODE_WIDE
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006515 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006516 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006517 Py_UNICODE uch2;
6518 ((char *) &uch2)[0] = s[0];
6519 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006520 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006521 {
Victor Stinner551ac952011-11-29 22:58:13 +01006522 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006523 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006524 }
6525 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006526#endif
6527
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006528 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006529 goto onError;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006530 continue;
6531
6532 error:
6533 startinpos = s - starts;
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006534 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006535 errors, &errorHandler,
6536 "unicode_internal", reason,
6537 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006538 &writer))
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006539 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006540 }
6541
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006542 Py_XDECREF(errorHandler);
6543 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006544 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006545
Benjamin Peterson29060642009-01-31 22:14:21 +00006546 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006547 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006548 Py_XDECREF(errorHandler);
6549 Py_XDECREF(exc);
6550 return NULL;
6551}
6552
Guido van Rossumd57fd912000-03-10 22:53:23 +00006553/* --- Latin-1 Codec ------------------------------------------------------ */
6554
Alexander Belopolsky40018472011-02-26 01:02:56 +00006555PyObject *
6556PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006557 Py_ssize_t size,
6558 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006559{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006560 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006561 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006562}
6563
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006564/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006565static void
6566make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006567 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006568 PyObject *unicode,
6569 Py_ssize_t startpos, Py_ssize_t endpos,
6570 const char *reason)
6571{
6572 if (*exceptionObject == NULL) {
6573 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006574 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006575 encoding, unicode, startpos, endpos, reason);
6576 }
6577 else {
6578 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6579 goto onError;
6580 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6581 goto onError;
6582 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6583 goto onError;
6584 return;
6585 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006586 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006587 }
6588}
6589
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006590/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006591static void
6592raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006593 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006594 PyObject *unicode,
6595 Py_ssize_t startpos, Py_ssize_t endpos,
6596 const char *reason)
6597{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006598 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006599 encoding, unicode, startpos, endpos, reason);
6600 if (*exceptionObject != NULL)
6601 PyCodec_StrictErrors(*exceptionObject);
6602}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006603
6604/* error handling callback helper:
6605 build arguments, call the callback and check the arguments,
6606 put the result into newpos and return the replacement string, which
6607 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006608static PyObject *
6609unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006610 PyObject **errorHandler,
6611 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006612 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006613 Py_ssize_t startpos, Py_ssize_t endpos,
6614 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006615{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02006616 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006617 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006618 PyObject *restuple;
6619 PyObject *resunicode;
6620
6621 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006622 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006623 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006624 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006625 }
6626
Benjamin Petersonbac79492012-01-14 13:34:47 -05006627 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006628 return NULL;
6629 len = PyUnicode_GET_LENGTH(unicode);
6630
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006631 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006632 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006633 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006634 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006635
6636 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006637 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006638 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006639 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006640 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006641 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006642 Py_DECREF(restuple);
6643 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006644 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006645 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006646 &resunicode, newpos)) {
6647 Py_DECREF(restuple);
6648 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006649 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006650 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6651 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6652 Py_DECREF(restuple);
6653 return NULL;
6654 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006655 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006656 *newpos = len + *newpos;
6657 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02006658 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006659 Py_DECREF(restuple);
6660 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006661 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006662 Py_INCREF(resunicode);
6663 Py_DECREF(restuple);
6664 return resunicode;
6665}
6666
Alexander Belopolsky40018472011-02-26 01:02:56 +00006667static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006668unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006669 const char *errors,
Victor Stinner0030cd52015-09-24 14:45:00 +02006670 const Py_UCS4 limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006671{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006672 /* input state */
6673 Py_ssize_t pos=0, size;
6674 int kind;
6675 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006676 /* pointer into the output */
6677 char *str;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006678 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6679 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Victor Stinner50149202015-09-22 00:26:54 +02006680 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006681 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02006682 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006683 PyObject *rep = NULL;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006684 /* output object */
6685 _PyBytesWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006686
Benjamin Petersonbac79492012-01-14 13:34:47 -05006687 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006688 return NULL;
6689 size = PyUnicode_GET_LENGTH(unicode);
6690 kind = PyUnicode_KIND(unicode);
6691 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006692 /* allocate enough for a simple encoding without
6693 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006694 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006695 return PyBytes_FromStringAndSize(NULL, 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006696
6697 _PyBytesWriter_Init(&writer);
6698 str = _PyBytesWriter_Alloc(&writer, size);
6699 if (str == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006700 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006701
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006702 while (pos < size) {
Victor Stinner0030cd52015-09-24 14:45:00 +02006703 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006704
Benjamin Peterson29060642009-01-31 22:14:21 +00006705 /* can we encode this? */
Victor Stinner0030cd52015-09-24 14:45:00 +02006706 if (ch < limit) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006707 /* no overflow check, because we know that the space is enough */
Victor Stinner0030cd52015-09-24 14:45:00 +02006708 *str++ = (char)ch;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006709 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006710 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006711 else {
Victor Stinner6bd525b2015-10-09 13:10:05 +02006712 Py_ssize_t newpos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006713 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006714 Py_ssize_t collstart = pos;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006715 Py_ssize_t collend = collstart + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006716 /* find all unecodable characters */
Victor Stinner50149202015-09-22 00:26:54 +02006717
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006718 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006719 ++collend;
Victor Stinner50149202015-09-22 00:26:54 +02006720
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006721 /* Only overallocate the buffer if it's not the last write */
6722 writer.overallocate = (collend < size);
6723
Benjamin Peterson29060642009-01-31 22:14:21 +00006724 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02006725 if (error_handler == _Py_ERROR_UNKNOWN)
6726 error_handler = get_error_handler(errors);
6727
6728 switch (error_handler) {
6729 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006730 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006731 goto onError;
Victor Stinner50149202015-09-22 00:26:54 +02006732
6733 case _Py_ERROR_REPLACE:
Victor Stinner01ada392015-10-01 21:54:51 +02006734 memset(str, '?', collend - collstart);
6735 str += (collend - collstart);
Victor Stinner0030cd52015-09-24 14:45:00 +02006736 /* fall through ignore error handler */
Victor Stinner50149202015-09-22 00:26:54 +02006737 case _Py_ERROR_IGNORE:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006738 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006739 break;
Victor Stinner50149202015-09-22 00:26:54 +02006740
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006741 case _Py_ERROR_BACKSLASHREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006742 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006743 writer.min_size -= (collend - collstart);
6744 str = backslashreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006745 unicode, collstart, collend);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006746 if (str == NULL)
6747 goto onError;
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006748 pos = collend;
6749 break;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006750
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006751 case _Py_ERROR_XMLCHARREFREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006752 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006753 writer.min_size -= (collend - collstart);
6754 str = xmlcharrefreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006755 unicode, collstart, collend);
6756 if (str == NULL)
6757 goto onError;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006758 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006759 break;
Victor Stinner50149202015-09-22 00:26:54 +02006760
Victor Stinnerc3713e92015-09-29 12:32:13 +02006761 case _Py_ERROR_SURROGATEESCAPE:
6762 for (i = collstart; i < collend; ++i) {
6763 ch = PyUnicode_READ(kind, data, i);
6764 if (ch < 0xdc80 || 0xdcff < ch) {
6765 /* Not a UTF-8b surrogate */
6766 break;
6767 }
6768 *str++ = (char)(ch - 0xdc00);
6769 ++pos;
6770 }
6771 if (i >= collend)
6772 break;
6773 collstart = pos;
6774 assert(collstart != collend);
6775 /* fallback to general error handling */
6776
Benjamin Peterson29060642009-01-31 22:14:21 +00006777 default:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006778 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
6779 encoding, reason, unicode, &exc,
6780 collstart, collend, &newpos);
6781 if (rep == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006782 goto onError;
Victor Stinner0030cd52015-09-24 14:45:00 +02006783
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006784 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006785 writer.min_size -= 1;
6786
Victor Stinner6bd525b2015-10-09 13:10:05 +02006787 if (PyBytes_Check(rep)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006788 /* Directly copy bytes result to output. */
Victor Stinnerce179bf2015-10-09 12:57:22 +02006789 str = _PyBytesWriter_WriteBytes(&writer, str,
Victor Stinner6bd525b2015-10-09 13:10:05 +02006790 PyBytes_AS_STRING(rep),
6791 PyBytes_GET_SIZE(rep));
Victor Stinnerad771582015-10-09 12:38:53 +02006792 if (str == NULL)
6793 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006794 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02006795 else {
6796 assert(PyUnicode_Check(rep));
Victor Stinner0030cd52015-09-24 14:45:00 +02006797
Victor Stinner6bd525b2015-10-09 13:10:05 +02006798 if (PyUnicode_READY(rep) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006799 goto onError;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006800
6801 if (PyUnicode_IS_ASCII(rep)) {
6802 /* Fast path: all characters are smaller than limit */
6803 assert(limit >= 128);
6804 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6805 str = _PyBytesWriter_WriteBytes(&writer, str,
6806 PyUnicode_DATA(rep),
6807 PyUnicode_GET_LENGTH(rep));
Benjamin Peterson29060642009-01-31 22:14:21 +00006808 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02006809 else {
6810 Py_ssize_t repsize = PyUnicode_GET_LENGTH(rep);
6811
6812 str = _PyBytesWriter_Prepare(&writer, str, repsize);
6813 if (str == NULL)
6814 goto onError;
6815
6816 /* check if there is anything unencodable in the
6817 replacement and copy it to the output */
6818 for (i = 0; repsize-->0; ++i, ++str) {
6819 ch = PyUnicode_READ_CHAR(rep, i);
6820 if (ch >= limit) {
6821 raise_encode_exception(&exc, encoding, unicode,
6822 pos, pos+1, reason);
6823 goto onError;
6824 }
6825 *str = (char)ch;
6826 }
6827 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006828 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006829 pos = newpos;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006830 Py_CLEAR(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006831 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006832
6833 /* If overallocation was disabled, ensure that it was the last
6834 write. Otherwise, we missed an optimization */
6835 assert(writer.overallocate || pos == size);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006836 }
6837 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006838
Victor Stinner50149202015-09-22 00:26:54 +02006839 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006840 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006841 return _PyBytesWriter_Finish(&writer, str);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006842
6843 onError:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006844 Py_XDECREF(rep);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006845 _PyBytesWriter_Dealloc(&writer);
Victor Stinner50149202015-09-22 00:26:54 +02006846 Py_XDECREF(error_handler_obj);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006847 Py_XDECREF(exc);
6848 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006849}
6850
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006851/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006852PyObject *
6853PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006854 Py_ssize_t size,
6855 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006856{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006857 PyObject *result;
6858 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6859 if (unicode == NULL)
6860 return NULL;
6861 result = unicode_encode_ucs1(unicode, errors, 256);
6862 Py_DECREF(unicode);
6863 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006864}
6865
Alexander Belopolsky40018472011-02-26 01:02:56 +00006866PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006867_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006868{
6869 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006870 PyErr_BadArgument();
6871 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006872 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006873 if (PyUnicode_READY(unicode) == -1)
6874 return NULL;
6875 /* Fast path: if it is a one-byte string, construct
6876 bytes object directly. */
6877 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6878 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6879 PyUnicode_GET_LENGTH(unicode));
6880 /* Non-Latin-1 characters present. Defer to above function to
6881 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006882 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006883}
6884
6885PyObject*
6886PyUnicode_AsLatin1String(PyObject *unicode)
6887{
6888 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006889}
6890
6891/* --- 7-bit ASCII Codec -------------------------------------------------- */
6892
Alexander Belopolsky40018472011-02-26 01:02:56 +00006893PyObject *
6894PyUnicode_DecodeASCII(const char *s,
6895 Py_ssize_t size,
6896 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006897{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006898 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006899 _PyUnicodeWriter writer;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006900 int kind;
6901 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006902 Py_ssize_t startinpos;
6903 Py_ssize_t endinpos;
6904 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006905 const char *e;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006906 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006907 PyObject *exc = NULL;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006908 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Tim Petersced69f82003-09-16 20:30:58 +00006909
Guido van Rossumd57fd912000-03-10 22:53:23 +00006910 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02006911 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006912
Guido van Rossumd57fd912000-03-10 22:53:23 +00006913 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006914 if (size == 1 && (unsigned char)s[0] < 128)
6915 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006916
Victor Stinner8f674cc2013-04-17 23:02:17 +02006917 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02006918 writer.min_length = size;
6919 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
Victor Stinner8f674cc2013-04-17 23:02:17 +02006920 return NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006921
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006922 e = s + size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006923 data = writer.data;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006924 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006925 writer.pos = outpos;
6926 if (writer.pos == size)
6927 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006928
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006929 s += writer.pos;
6930 kind = writer.kind;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006931 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02006932 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00006933 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006934 PyUnicode_WRITE(kind, data, writer.pos, c);
6935 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00006936 ++s;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006937 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +00006938 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02006939
6940 /* byte outsize range 0x00..0x7f: call the error handler */
6941
6942 if (error_handler == _Py_ERROR_UNKNOWN)
6943 error_handler = get_error_handler(errors);
6944
6945 switch (error_handler)
6946 {
6947 case _Py_ERROR_REPLACE:
6948 case _Py_ERROR_SURROGATEESCAPE:
6949 /* Fast-path: the error handler only writes one character,
Victor Stinnerca9381e2015-09-22 00:58:32 +02006950 but we may switch to UCS2 at the first write */
6951 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
6952 goto onError;
6953 kind = writer.kind;
6954 data = writer.data;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006955
6956 if (error_handler == _Py_ERROR_REPLACE)
6957 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
6958 else
6959 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
6960 writer.pos++;
6961 ++s;
6962 break;
6963
6964 case _Py_ERROR_IGNORE:
6965 ++s;
6966 break;
6967
6968 default:
Benjamin Peterson29060642009-01-31 22:14:21 +00006969 startinpos = s-starts;
6970 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006971 if (unicode_decode_call_errorhandler_writer(
Victor Stinnerf96418d2015-09-21 23:06:27 +02006972 errors, &error_handler_obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00006973 "ascii", "ordinal not in range(128)",
6974 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006975 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006976 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006977 kind = writer.kind;
6978 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00006979 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006980 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02006981 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006982 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006983 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006984
Benjamin Peterson29060642009-01-31 22:14:21 +00006985 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006986 _PyUnicodeWriter_Dealloc(&writer);
Victor Stinnerf96418d2015-09-21 23:06:27 +02006987 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006988 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006989 return NULL;
6990}
6991
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006992/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006993PyObject *
6994PyUnicode_EncodeASCII(const Py_UNICODE *p,
6995 Py_ssize_t size,
6996 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006997{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006998 PyObject *result;
6999 PyObject *unicode = PyUnicode_FromUnicode(p, size);
7000 if (unicode == NULL)
7001 return NULL;
7002 result = unicode_encode_ucs1(unicode, errors, 128);
7003 Py_DECREF(unicode);
7004 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007005}
7006
Alexander Belopolsky40018472011-02-26 01:02:56 +00007007PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007008_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007009{
7010 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007011 PyErr_BadArgument();
7012 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007013 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007014 if (PyUnicode_READY(unicode) == -1)
7015 return NULL;
7016 /* Fast path: if it is an ASCII-only string, construct bytes object
7017 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02007018 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007019 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7020 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007021 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007022}
7023
7024PyObject *
7025PyUnicode_AsASCIIString(PyObject *unicode)
7026{
7027 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007028}
7029
Steve Dowercc16be82016-09-08 10:35:16 -07007030#ifdef MS_WINDOWS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007031
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007032/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007033
/* Input lengths are Py_ssize_t while the code page helpers below take int
   sizes.  When size_t is wider than int, NEED_RETRY makes the stateful
   decoder process oversized inputs in INT_MAX-sized chunks. */
#if SIZEOF_SIZE_T > SIZEOF_INT
#  define NEED_RETRY
#endif

/* Fallback definition for headers that do not provide this flag. */
#if !defined(WC_ERR_INVALID_CHARS)
#  define WC_ERR_INVALID_CHARS 0x0080
#endif
7041
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007042static const char*
Victor Stinner3a50e702011-10-18 21:21:00 +02007043code_page_name(UINT code_page, PyObject **obj)
7044{
7045 *obj = NULL;
7046 if (code_page == CP_ACP)
7047 return "mbcs";
7048 if (code_page == CP_UTF7)
7049 return "CP_UTF7";
7050 if (code_page == CP_UTF8)
7051 return "CP_UTF8";
7052
7053 *obj = PyBytes_FromFormat("cp%u", code_page);
7054 if (*obj == NULL)
7055 return NULL;
7056 return PyBytes_AS_STRING(*obj);
7057}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007058
Victor Stinner3a50e702011-10-18 21:21:00 +02007059static DWORD
7060decode_code_page_flags(UINT code_page)
7061{
7062 if (code_page == CP_UTF7) {
7063 /* The CP_UTF7 decoder only supports flags=0 */
7064 return 0;
7065 }
7066 else
7067 return MB_ERR_INVALID_CHARS;
7068}
7069
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007070/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007071 * Decode a byte string from a Windows code page into unicode object in strict
7072 * mode.
7073 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007074 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7075 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007076 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007077static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007078decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007079 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02007080 const char *in,
7081 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007082{
Victor Stinner3a50e702011-10-18 21:21:00 +02007083 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01007084 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007085 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007086
7087 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007088 assert(insize > 0);
7089 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
7090 if (outsize <= 0)
7091 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007092
7093 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007094 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01007095 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007096 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00007097 if (*v == NULL)
7098 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007099 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007100 }
7101 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007102 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007103 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01007104 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007105 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007106 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007107 }
7108
7109 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007110 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7111 if (outsize <= 0)
7112 goto error;
7113 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007114
Victor Stinner3a50e702011-10-18 21:21:00 +02007115error:
7116 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7117 return -2;
7118 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007119 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007120}
7121
Victor Stinner3a50e702011-10-18 21:21:00 +02007122/*
7123 * Decode a byte string from a code page into unicode object with an error
7124 * handler.
7125 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007126 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02007127 * UnicodeDecodeError exception and returns -1 on error.
7128 */
7129static int
7130decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007131 PyObject **v,
7132 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007133 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02007134{
7135 const char *startin = in;
7136 const char *endin = in + size;
7137 const DWORD flags = decode_code_page_flags(code_page);
7138 /* Ideally, we should get reason from FormatMessage. This is the Windows
7139 2000 English version of the message. */
7140 const char *reason = "No mapping for the Unicode character exists "
7141 "in the target code page.";
7142 /* each step cannot decode more than 1 character, but a character can be
7143 represented as a surrogate pair */
7144 wchar_t buffer[2], *startout, *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02007145 int insize;
7146 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007147 PyObject *errorHandler = NULL;
7148 PyObject *exc = NULL;
7149 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007150 const char *encoding;
Victor Stinner3a50e702011-10-18 21:21:00 +02007151 DWORD err;
7152 int ret = -1;
7153
7154 assert(size > 0);
7155
7156 encoding = code_page_name(code_page, &encoding_obj);
7157 if (encoding == NULL)
7158 return -1;
7159
Victor Stinner7d00cc12014-03-17 23:08:06 +01007160 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007161 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7162 UnicodeDecodeError. */
7163 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7164 if (exc != NULL) {
7165 PyCodec_StrictErrors(exc);
7166 Py_CLEAR(exc);
7167 }
7168 goto error;
7169 }
7170
7171 if (*v == NULL) {
7172 /* Create unicode object */
7173 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7174 PyErr_NoMemory();
7175 goto error;
7176 }
Victor Stinnerab595942011-12-17 04:59:06 +01007177 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007178 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02007179 if (*v == NULL)
7180 goto error;
7181 startout = PyUnicode_AS_UNICODE(*v);
7182 }
7183 else {
7184 /* Extend unicode object */
7185 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7186 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7187 PyErr_NoMemory();
7188 goto error;
7189 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007190 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007191 goto error;
7192 startout = PyUnicode_AS_UNICODE(*v) + n;
7193 }
7194
7195 /* Decode the byte string character per character */
7196 out = startout;
7197 while (in < endin)
7198 {
7199 /* Decode a character */
7200 insize = 1;
7201 do
7202 {
7203 outsize = MultiByteToWideChar(code_page, flags,
7204 in, insize,
7205 buffer, Py_ARRAY_LENGTH(buffer));
7206 if (outsize > 0)
7207 break;
7208 err = GetLastError();
7209 if (err != ERROR_NO_UNICODE_TRANSLATION
7210 && err != ERROR_INSUFFICIENT_BUFFER)
7211 {
7212 PyErr_SetFromWindowsErr(0);
7213 goto error;
7214 }
7215 insize++;
7216 }
7217 /* 4=maximum length of a UTF-8 sequence */
7218 while (insize <= 4 && (in + insize) <= endin);
7219
7220 if (outsize <= 0) {
7221 Py_ssize_t startinpos, endinpos, outpos;
7222
Victor Stinner7d00cc12014-03-17 23:08:06 +01007223 /* last character in partial decode? */
7224 if (in + insize >= endin && !final)
7225 break;
7226
Victor Stinner3a50e702011-10-18 21:21:00 +02007227 startinpos = in - startin;
7228 endinpos = startinpos + 1;
7229 outpos = out - PyUnicode_AS_UNICODE(*v);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007230 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007231 errors, &errorHandler,
7232 encoding, reason,
7233 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01007234 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007235 {
7236 goto error;
7237 }
Victor Stinner596a6c42011-11-09 00:02:18 +01007238 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007239 }
7240 else {
7241 in += insize;
7242 memcpy(out, buffer, outsize * sizeof(wchar_t));
7243 out += outsize;
7244 }
7245 }
7246
7247 /* write a NUL character at the end */
7248 *out = 0;
7249
7250 /* Extend unicode object */
7251 outsize = out - startout;
7252 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01007253 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007254 goto error;
Victor Stinnere1f17c62014-07-25 14:03:03 +02007255 /* (in - startin) <= size and size is an int */
7256 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02007257
7258error:
7259 Py_XDECREF(encoding_obj);
7260 Py_XDECREF(errorHandler);
7261 Py_XDECREF(exc);
7262 return ret;
7263}
7264
Victor Stinner3a50e702011-10-18 21:21:00 +02007265static PyObject *
7266decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007267 const char *s, Py_ssize_t size,
7268 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007269{
Victor Stinner76a31a62011-11-04 00:05:13 +01007270 PyObject *v = NULL;
7271 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007272
Victor Stinner3a50e702011-10-18 21:21:00 +02007273 if (code_page < 0) {
7274 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7275 return NULL;
7276 }
7277
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007278 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007279 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007280
Victor Stinner76a31a62011-11-04 00:05:13 +01007281 do
7282 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007283#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007284 if (size > INT_MAX) {
7285 chunk_size = INT_MAX;
7286 final = 0;
7287 done = 0;
7288 }
7289 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007290#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007291 {
7292 chunk_size = (int)size;
7293 final = (consumed == NULL);
7294 done = 1;
7295 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007296
Victor Stinner76a31a62011-11-04 00:05:13 +01007297 if (chunk_size == 0 && done) {
7298 if (v != NULL)
7299 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007300 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007301 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007302
Victor Stinner76a31a62011-11-04 00:05:13 +01007303 converted = decode_code_page_strict(code_page, &v,
7304 s, chunk_size);
7305 if (converted == -2)
7306 converted = decode_code_page_errors(code_page, &v,
7307 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007308 errors, final);
7309 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007310
7311 if (converted < 0) {
7312 Py_XDECREF(v);
7313 return NULL;
7314 }
7315
7316 if (consumed)
7317 *consumed += converted;
7318
7319 s += converted;
7320 size -= converted;
7321 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007322
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007323 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007324}
7325
Alexander Belopolsky40018472011-02-26 01:02:56 +00007326PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007327PyUnicode_DecodeCodePageStateful(int code_page,
7328 const char *s,
7329 Py_ssize_t size,
7330 const char *errors,
7331 Py_ssize_t *consumed)
7332{
7333 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7334}
7335
7336PyObject *
7337PyUnicode_DecodeMBCSStateful(const char *s,
7338 Py_ssize_t size,
7339 const char *errors,
7340 Py_ssize_t *consumed)
7341{
7342 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7343}
7344
7345PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007346PyUnicode_DecodeMBCS(const char *s,
7347 Py_ssize_t size,
7348 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007349{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007350 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7351}
7352
Victor Stinner3a50e702011-10-18 21:21:00 +02007353static DWORD
7354encode_code_page_flags(UINT code_page, const char *errors)
7355{
7356 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007357 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007358 }
7359 else if (code_page == CP_UTF7) {
7360 /* CP_UTF7 only supports flags=0 */
7361 return 0;
7362 }
7363 else {
7364 if (errors != NULL && strcmp(errors, "replace") == 0)
7365 return 0;
7366 else
7367 return WC_NO_BEST_FIT_CHARS;
7368 }
7369}
7370
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007371/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007372 * Encode a Unicode string to a Windows code page into a byte string in strict
7373 * mode.
7374 *
7375 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007376 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007377 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007378static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007379encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007380 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007381 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007382{
Victor Stinner554f3f02010-06-16 23:33:54 +00007383 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007384 BOOL *pusedDefaultChar = &usedDefaultChar;
7385 int outsize;
Victor Stinner24729f32011-11-10 20:31:37 +01007386 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007387 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007388 const DWORD flags = encode_code_page_flags(code_page, NULL);
7389 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007390 /* Create a substring so that we can get the UTF-16 representation
7391 of just the slice under consideration. */
7392 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007393
Martin v. Löwis3d325192011-11-04 18:23:06 +01007394 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007395
Victor Stinner3a50e702011-10-18 21:21:00 +02007396 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007397 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007398 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007399 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007400
Victor Stinner2fc507f2011-11-04 20:06:39 +01007401 substring = PyUnicode_Substring(unicode, offset, offset+len);
7402 if (substring == NULL)
7403 return -1;
7404 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7405 if (p == NULL) {
7406 Py_DECREF(substring);
7407 return -1;
7408 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007409 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007410
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007411 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007412 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007413 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007414 NULL, 0,
7415 NULL, pusedDefaultChar);
7416 if (outsize <= 0)
7417 goto error;
7418 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007419 if (pusedDefaultChar && *pusedDefaultChar) {
7420 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007421 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007422 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007423
Victor Stinner3a50e702011-10-18 21:21:00 +02007424 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007425 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007426 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007427 if (*outbytes == NULL) {
7428 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007429 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007430 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007431 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007432 }
7433 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007434 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007435 const Py_ssize_t n = PyBytes_Size(*outbytes);
7436 if (outsize > PY_SSIZE_T_MAX - n) {
7437 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007438 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007439 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007440 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007441 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7442 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007443 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007444 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007445 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007446 }
7447
7448 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007449 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007450 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007451 out, outsize,
7452 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007453 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007454 if (outsize <= 0)
7455 goto error;
7456 if (pusedDefaultChar && *pusedDefaultChar)
7457 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007458 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007459
Victor Stinner3a50e702011-10-18 21:21:00 +02007460error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007461 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007462 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7463 return -2;
7464 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007465 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007466}
7467
Victor Stinner3a50e702011-10-18 21:21:00 +02007468/*
Serhiy Storchakad65c9492015-11-02 14:10:23 +02007469 * Encode a Unicode string to a Windows code page into a byte string using an
Victor Stinner3a50e702011-10-18 21:21:00 +02007470 * error handler.
7471 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007472 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007473 * -1 on other error.
7474 */
7475static int
7476encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007477 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007478 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007479{
Victor Stinner3a50e702011-10-18 21:21:00 +02007480 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007481 Py_ssize_t pos = unicode_offset;
7482 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007483 /* Ideally, we should get reason from FormatMessage. This is the Windows
7484 2000 English version of the message. */
7485 const char *reason = "invalid character";
7486 /* 4=maximum length of a UTF-8 sequence */
7487 char buffer[4];
7488 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7489 Py_ssize_t outsize;
7490 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007491 PyObject *errorHandler = NULL;
7492 PyObject *exc = NULL;
7493 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007494 const char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007495 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007496 PyObject *rep;
7497 int ret = -1;
7498
7499 assert(insize > 0);
7500
7501 encoding = code_page_name(code_page, &encoding_obj);
7502 if (encoding == NULL)
7503 return -1;
7504
7505 if (errors == NULL || strcmp(errors, "strict") == 0) {
7506 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7507 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007508 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007509 if (exc != NULL) {
7510 PyCodec_StrictErrors(exc);
7511 Py_DECREF(exc);
7512 }
7513 Py_XDECREF(encoding_obj);
7514 return -1;
7515 }
7516
7517 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7518 pusedDefaultChar = &usedDefaultChar;
7519 else
7520 pusedDefaultChar = NULL;
7521
7522 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7523 PyErr_NoMemory();
7524 goto error;
7525 }
7526 outsize = insize * Py_ARRAY_LENGTH(buffer);
7527
7528 if (*outbytes == NULL) {
7529 /* Create string object */
7530 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7531 if (*outbytes == NULL)
7532 goto error;
7533 out = PyBytes_AS_STRING(*outbytes);
7534 }
7535 else {
7536 /* Extend string object */
7537 Py_ssize_t n = PyBytes_Size(*outbytes);
7538 if (n > PY_SSIZE_T_MAX - outsize) {
7539 PyErr_NoMemory();
7540 goto error;
7541 }
7542 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7543 goto error;
7544 out = PyBytes_AS_STRING(*outbytes) + n;
7545 }
7546
7547 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007548 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007549 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007550 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7551 wchar_t chars[2];
7552 int charsize;
7553 if (ch < 0x10000) {
7554 chars[0] = (wchar_t)ch;
7555 charsize = 1;
7556 }
7557 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007558 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7559 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007560 charsize = 2;
7561 }
7562
Victor Stinner3a50e702011-10-18 21:21:00 +02007563 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007564 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007565 buffer, Py_ARRAY_LENGTH(buffer),
7566 NULL, pusedDefaultChar);
7567 if (outsize > 0) {
7568 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7569 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007570 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007571 memcpy(out, buffer, outsize);
7572 out += outsize;
7573 continue;
7574 }
7575 }
7576 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7577 PyErr_SetFromWindowsErr(0);
7578 goto error;
7579 }
7580
Victor Stinner3a50e702011-10-18 21:21:00 +02007581 rep = unicode_encode_call_errorhandler(
7582 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007583 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007584 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007585 if (rep == NULL)
7586 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007587 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007588
7589 if (PyBytes_Check(rep)) {
7590 outsize = PyBytes_GET_SIZE(rep);
7591 if (outsize != 1) {
7592 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7593 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7594 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7595 Py_DECREF(rep);
7596 goto error;
7597 }
7598 out = PyBytes_AS_STRING(*outbytes) + offset;
7599 }
7600 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7601 out += outsize;
7602 }
7603 else {
7604 Py_ssize_t i;
7605 enum PyUnicode_Kind kind;
7606 void *data;
7607
Benjamin Petersonbac79492012-01-14 13:34:47 -05007608 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007609 Py_DECREF(rep);
7610 goto error;
7611 }
7612
7613 outsize = PyUnicode_GET_LENGTH(rep);
7614 if (outsize != 1) {
7615 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7616 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7617 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7618 Py_DECREF(rep);
7619 goto error;
7620 }
7621 out = PyBytes_AS_STRING(*outbytes) + offset;
7622 }
7623 kind = PyUnicode_KIND(rep);
7624 data = PyUnicode_DATA(rep);
7625 for (i=0; i < outsize; i++) {
7626 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7627 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007628 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007629 encoding, unicode,
7630 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007631 "unable to encode error handler result to ASCII");
7632 Py_DECREF(rep);
7633 goto error;
7634 }
7635 *out = (unsigned char)ch;
7636 out++;
7637 }
7638 }
7639 Py_DECREF(rep);
7640 }
7641 /* write a NUL byte */
7642 *out = 0;
7643 outsize = out - PyBytes_AS_STRING(*outbytes);
7644 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7645 if (_PyBytes_Resize(outbytes, outsize) < 0)
7646 goto error;
7647 ret = 0;
7648
7649error:
7650 Py_XDECREF(encoding_obj);
7651 Py_XDECREF(errorHandler);
7652 Py_XDECREF(exc);
7653 return ret;
7654}
7655
Victor Stinner3a50e702011-10-18 21:21:00 +02007656static PyObject *
7657encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007658 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007659 const char *errors)
7660{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007661 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007662 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007663 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007664 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007665
Victor Stinner29dacf22015-01-26 16:41:32 +01007666 if (!PyUnicode_Check(unicode)) {
7667 PyErr_BadArgument();
7668 return NULL;
7669 }
7670
Benjamin Petersonbac79492012-01-14 13:34:47 -05007671 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007672 return NULL;
7673 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007674
Victor Stinner3a50e702011-10-18 21:21:00 +02007675 if (code_page < 0) {
7676 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7677 return NULL;
7678 }
7679
Martin v. Löwis3d325192011-11-04 18:23:06 +01007680 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007681 return PyBytes_FromStringAndSize(NULL, 0);
7682
Victor Stinner7581cef2011-11-03 22:32:33 +01007683 offset = 0;
7684 do
7685 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007686#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007687 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007688 chunks. */
7689 if (len > INT_MAX/2) {
7690 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007691 done = 0;
7692 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007693 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007694#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007695 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007696 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007697 done = 1;
7698 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007699
Victor Stinner76a31a62011-11-04 00:05:13 +01007700 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007701 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007702 errors);
7703 if (ret == -2)
7704 ret = encode_code_page_errors(code_page, &outbytes,
7705 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007706 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007707 if (ret < 0) {
7708 Py_XDECREF(outbytes);
7709 return NULL;
7710 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007711
Victor Stinner7581cef2011-11-03 22:32:33 +01007712 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007713 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007714 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007715
Victor Stinner3a50e702011-10-18 21:21:00 +02007716 return outbytes;
7717}
7718
7719PyObject *
7720PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7721 Py_ssize_t size,
7722 const char *errors)
7723{
Victor Stinner7581cef2011-11-03 22:32:33 +01007724 PyObject *unicode, *res;
7725 unicode = PyUnicode_FromUnicode(p, size);
7726 if (unicode == NULL)
7727 return NULL;
7728 res = encode_code_page(CP_ACP, unicode, errors);
7729 Py_DECREF(unicode);
7730 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007731}
7732
7733PyObject *
7734PyUnicode_EncodeCodePage(int code_page,
7735 PyObject *unicode,
7736 const char *errors)
7737{
Victor Stinner7581cef2011-11-03 22:32:33 +01007738 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007739}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007740
Alexander Belopolsky40018472011-02-26 01:02:56 +00007741PyObject *
7742PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007743{
Victor Stinner7581cef2011-11-03 22:32:33 +01007744 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007745}
7746
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007747#undef NEED_RETRY
7748
Steve Dowercc16be82016-09-08 10:35:16 -07007749#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007750
Guido van Rossumd57fd912000-03-10 22:53:23 +00007751/* --- Character Mapping Codec -------------------------------------------- */
7752
Victor Stinnerfb161b12013-04-18 01:44:27 +02007753static int
7754charmap_decode_string(const char *s,
7755 Py_ssize_t size,
7756 PyObject *mapping,
7757 const char *errors,
7758 _PyUnicodeWriter *writer)
7759{
7760 const char *starts = s;
7761 const char *e;
7762 Py_ssize_t startinpos, endinpos;
7763 PyObject *errorHandler = NULL, *exc = NULL;
7764 Py_ssize_t maplen;
7765 enum PyUnicode_Kind mapkind;
7766 void *mapdata;
7767 Py_UCS4 x;
7768 unsigned char ch;
7769
7770 if (PyUnicode_READY(mapping) == -1)
7771 return -1;
7772
7773 maplen = PyUnicode_GET_LENGTH(mapping);
7774 mapdata = PyUnicode_DATA(mapping);
7775 mapkind = PyUnicode_KIND(mapping);
7776
7777 e = s + size;
7778
7779 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7780 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7781 * is disabled in encoding aliases, latin1 is preferred because
7782 * its implementation is faster. */
7783 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7784 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7785 Py_UCS4 maxchar = writer->maxchar;
7786
7787 assert (writer->kind == PyUnicode_1BYTE_KIND);
7788 while (s < e) {
7789 ch = *s;
7790 x = mapdata_ucs1[ch];
7791 if (x > maxchar) {
7792 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7793 goto onError;
7794 maxchar = writer->maxchar;
7795 outdata = (Py_UCS1 *)writer->data;
7796 }
7797 outdata[writer->pos] = x;
7798 writer->pos++;
7799 ++s;
7800 }
7801 return 0;
7802 }
7803
7804 while (s < e) {
7805 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7806 enum PyUnicode_Kind outkind = writer->kind;
7807 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7808 if (outkind == PyUnicode_1BYTE_KIND) {
7809 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7810 Py_UCS4 maxchar = writer->maxchar;
7811 while (s < e) {
7812 ch = *s;
7813 x = mapdata_ucs2[ch];
7814 if (x > maxchar)
7815 goto Error;
7816 outdata[writer->pos] = x;
7817 writer->pos++;
7818 ++s;
7819 }
7820 break;
7821 }
7822 else if (outkind == PyUnicode_2BYTE_KIND) {
7823 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7824 while (s < e) {
7825 ch = *s;
7826 x = mapdata_ucs2[ch];
7827 if (x == 0xFFFE)
7828 goto Error;
7829 outdata[writer->pos] = x;
7830 writer->pos++;
7831 ++s;
7832 }
7833 break;
7834 }
7835 }
7836 ch = *s;
7837
7838 if (ch < maplen)
7839 x = PyUnicode_READ(mapkind, mapdata, ch);
7840 else
7841 x = 0xfffe; /* invalid value */
7842Error:
7843 if (x == 0xfffe)
7844 {
7845 /* undefined mapping */
7846 startinpos = s-starts;
7847 endinpos = startinpos+1;
7848 if (unicode_decode_call_errorhandler_writer(
7849 errors, &errorHandler,
7850 "charmap", "character maps to <undefined>",
7851 &starts, &e, &startinpos, &endinpos, &exc, &s,
7852 writer)) {
7853 goto onError;
7854 }
7855 continue;
7856 }
7857
7858 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
7859 goto onError;
7860 ++s;
7861 }
7862 Py_XDECREF(errorHandler);
7863 Py_XDECREF(exc);
7864 return 0;
7865
7866onError:
7867 Py_XDECREF(errorHandler);
7868 Py_XDECREF(exc);
7869 return -1;
7870}
7871
7872static int
7873charmap_decode_mapping(const char *s,
7874 Py_ssize_t size,
7875 PyObject *mapping,
7876 const char *errors,
7877 _PyUnicodeWriter *writer)
7878{
7879 const char *starts = s;
7880 const char *e;
7881 Py_ssize_t startinpos, endinpos;
7882 PyObject *errorHandler = NULL, *exc = NULL;
7883 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02007884 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02007885
7886 e = s + size;
7887
7888 while (s < e) {
7889 ch = *s;
7890
7891 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7892 key = PyLong_FromLong((long)ch);
7893 if (key == NULL)
7894 goto onError;
7895
7896 item = PyObject_GetItem(mapping, key);
7897 Py_DECREF(key);
7898 if (item == NULL) {
7899 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7900 /* No mapping found means: mapping is undefined. */
7901 PyErr_Clear();
7902 goto Undefined;
7903 } else
7904 goto onError;
7905 }
7906
7907 /* Apply mapping */
7908 if (item == Py_None)
7909 goto Undefined;
7910 if (PyLong_Check(item)) {
7911 long value = PyLong_AS_LONG(item);
7912 if (value == 0xFFFE)
7913 goto Undefined;
7914 if (value < 0 || value > MAX_UNICODE) {
7915 PyErr_Format(PyExc_TypeError,
7916 "character mapping must be in range(0x%lx)",
7917 (unsigned long)MAX_UNICODE + 1);
7918 goto onError;
7919 }
7920
7921 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7922 goto onError;
7923 }
7924 else if (PyUnicode_Check(item)) {
7925 if (PyUnicode_READY(item) == -1)
7926 goto onError;
7927 if (PyUnicode_GET_LENGTH(item) == 1) {
7928 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
7929 if (value == 0xFFFE)
7930 goto Undefined;
7931 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7932 goto onError;
7933 }
7934 else {
7935 writer->overallocate = 1;
7936 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
7937 goto onError;
7938 }
7939 }
7940 else {
7941 /* wrong return value */
7942 PyErr_SetString(PyExc_TypeError,
7943 "character mapping must return integer, None or str");
7944 goto onError;
7945 }
7946 Py_CLEAR(item);
7947 ++s;
7948 continue;
7949
7950Undefined:
7951 /* undefined mapping */
7952 Py_CLEAR(item);
7953 startinpos = s-starts;
7954 endinpos = startinpos+1;
7955 if (unicode_decode_call_errorhandler_writer(
7956 errors, &errorHandler,
7957 "charmap", "character maps to <undefined>",
7958 &starts, &e, &startinpos, &endinpos, &exc, &s,
7959 writer)) {
7960 goto onError;
7961 }
7962 }
7963 Py_XDECREF(errorHandler);
7964 Py_XDECREF(exc);
7965 return 0;
7966
7967onError:
7968 Py_XDECREF(item);
7969 Py_XDECREF(errorHandler);
7970 Py_XDECREF(exc);
7971 return -1;
7972}
7973
Alexander Belopolsky40018472011-02-26 01:02:56 +00007974PyObject *
7975PyUnicode_DecodeCharmap(const char *s,
7976 Py_ssize_t size,
7977 PyObject *mapping,
7978 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007979{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007980 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00007981
Guido van Rossumd57fd912000-03-10 22:53:23 +00007982 /* Default to Latin-1 */
7983 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007984 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007985
Guido van Rossumd57fd912000-03-10 22:53:23 +00007986 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02007987 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02007988 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02007989 writer.min_length = size;
7990 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007991 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007992
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007993 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02007994 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
7995 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007996 }
7997 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02007998 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
7999 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008000 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008001 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00008002
Benjamin Peterson29060642009-01-31 22:14:21 +00008003 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008004 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008005 return NULL;
8006}
8007
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008008/* Charmap encoding: the lookup table */
8009
Alexander Belopolsky40018472011-02-26 01:02:56 +00008010struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00008011 PyObject_HEAD
8012 unsigned char level1[32];
8013 int count2, count3;
8014 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008015};
8016
8017static PyObject*
8018encoding_map_size(PyObject *obj, PyObject* args)
8019{
8020 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008021 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00008022 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008023}
8024
8025static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008026 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00008027 PyDoc_STR("Return the size (in bytes) of this object") },
8028 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008029};
8030
8031static void
8032encoding_map_dealloc(PyObject* o)
8033{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008034 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008035}
8036
8037static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008038 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008039 "EncodingMap", /*tp_name*/
8040 sizeof(struct encoding_map), /*tp_basicsize*/
8041 0, /*tp_itemsize*/
8042 /* methods */
8043 encoding_map_dealloc, /*tp_dealloc*/
8044 0, /*tp_print*/
8045 0, /*tp_getattr*/
8046 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00008047 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008048 0, /*tp_repr*/
8049 0, /*tp_as_number*/
8050 0, /*tp_as_sequence*/
8051 0, /*tp_as_mapping*/
8052 0, /*tp_hash*/
8053 0, /*tp_call*/
8054 0, /*tp_str*/
8055 0, /*tp_getattro*/
8056 0, /*tp_setattro*/
8057 0, /*tp_as_buffer*/
8058 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8059 0, /*tp_doc*/
8060 0, /*tp_traverse*/
8061 0, /*tp_clear*/
8062 0, /*tp_richcompare*/
8063 0, /*tp_weaklistoffset*/
8064 0, /*tp_iter*/
8065 0, /*tp_iternext*/
8066 encoding_map_methods, /*tp_methods*/
8067 0, /*tp_members*/
8068 0, /*tp_getset*/
8069 0, /*tp_base*/
8070 0, /*tp_dict*/
8071 0, /*tp_descr_get*/
8072 0, /*tp_descr_set*/
8073 0, /*tp_dictoffset*/
8074 0, /*tp_init*/
8075 0, /*tp_alloc*/
8076 0, /*tp_new*/
8077 0, /*tp_free*/
8078 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008079};
8080
8081PyObject*
8082PyUnicode_BuildEncodingMap(PyObject* string)
8083{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008084 PyObject *result;
8085 struct encoding_map *mresult;
8086 int i;
8087 int need_dict = 0;
8088 unsigned char level1[32];
8089 unsigned char level2[512];
8090 unsigned char *mlevel1, *mlevel2, *mlevel3;
8091 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008092 int kind;
8093 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008094 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008095 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008096
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008097 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008098 PyErr_BadArgument();
8099 return NULL;
8100 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008101 kind = PyUnicode_KIND(string);
8102 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008103 length = PyUnicode_GET_LENGTH(string);
8104 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008105 memset(level1, 0xFF, sizeof level1);
8106 memset(level2, 0xFF, sizeof level2);
8107
8108 /* If there isn't a one-to-one mapping of NULL to \0,
8109 or if there are non-BMP characters, we need to use
8110 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008111 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008112 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008113 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008114 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008115 ch = PyUnicode_READ(kind, data, i);
8116 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008117 need_dict = 1;
8118 break;
8119 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008120 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008121 /* unmapped character */
8122 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008123 l1 = ch >> 11;
8124 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008125 if (level1[l1] == 0xFF)
8126 level1[l1] = count2++;
8127 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008128 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008129 }
8130
8131 if (count2 >= 0xFF || count3 >= 0xFF)
8132 need_dict = 1;
8133
8134 if (need_dict) {
8135 PyObject *result = PyDict_New();
8136 PyObject *key, *value;
8137 if (!result)
8138 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008139 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008140 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008141 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008142 if (!key || !value)
8143 goto failed1;
8144 if (PyDict_SetItem(result, key, value) == -1)
8145 goto failed1;
8146 Py_DECREF(key);
8147 Py_DECREF(value);
8148 }
8149 return result;
8150 failed1:
8151 Py_XDECREF(key);
8152 Py_XDECREF(value);
8153 Py_DECREF(result);
8154 return NULL;
8155 }
8156
8157 /* Create a three-level trie */
8158 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8159 16*count2 + 128*count3 - 1);
8160 if (!result)
8161 return PyErr_NoMemory();
8162 PyObject_Init(result, &EncodingMapType);
8163 mresult = (struct encoding_map*)result;
8164 mresult->count2 = count2;
8165 mresult->count3 = count3;
8166 mlevel1 = mresult->level1;
8167 mlevel2 = mresult->level23;
8168 mlevel3 = mresult->level23 + 16*count2;
8169 memcpy(mlevel1, level1, 32);
8170 memset(mlevel2, 0xFF, 16*count2);
8171 memset(mlevel3, 0, 128*count3);
8172 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008173 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008174 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008175 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8176 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008177 /* unmapped character */
8178 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008179 o1 = ch>>11;
8180 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008181 i2 = 16*mlevel1[o1] + o2;
8182 if (mlevel2[i2] == 0xFF)
8183 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008184 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008185 i3 = 128*mlevel2[i2] + o3;
8186 mlevel3[i3] = i;
8187 }
8188 return result;
8189}
8190
8191static int
Victor Stinner22168992011-11-20 17:09:18 +01008192encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008193{
8194 struct encoding_map *map = (struct encoding_map*)mapping;
8195 int l1 = c>>11;
8196 int l2 = (c>>7) & 0xF;
8197 int l3 = c & 0x7F;
8198 int i;
8199
Victor Stinner22168992011-11-20 17:09:18 +01008200 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008201 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008202 if (c == 0)
8203 return 0;
8204 /* level 1*/
8205 i = map->level1[l1];
8206 if (i == 0xFF) {
8207 return -1;
8208 }
8209 /* level 2*/
8210 i = map->level23[16*i+l2];
8211 if (i == 0xFF) {
8212 return -1;
8213 }
8214 /* level 3 */
8215 i = map->level23[16*map->count2 + 128*i + l3];
8216 if (i == 0) {
8217 return -1;
8218 }
8219 return i;
8220}
8221
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008222/* Lookup the character ch in the mapping. If the character
8223 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008224 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008225static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008226charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008227{
Christian Heimes217cfd12007-12-02 14:31:20 +00008228 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008229 PyObject *x;
8230
8231 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008232 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008233 x = PyObject_GetItem(mapping, w);
8234 Py_DECREF(w);
8235 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008236 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8237 /* No mapping found means: mapping is undefined. */
8238 PyErr_Clear();
8239 x = Py_None;
8240 Py_INCREF(x);
8241 return x;
8242 } else
8243 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008244 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008245 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008246 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008247 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008248 long value = PyLong_AS_LONG(x);
8249 if (value < 0 || value > 255) {
8250 PyErr_SetString(PyExc_TypeError,
8251 "character mapping must be in range(256)");
8252 Py_DECREF(x);
8253 return NULL;
8254 }
8255 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008256 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008257 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008258 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008259 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008260 /* wrong return value */
8261 PyErr_Format(PyExc_TypeError,
8262 "character mapping must return integer, bytes or None, not %.400s",
8263 x->ob_type->tp_name);
8264 Py_DECREF(x);
8265 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008266 }
8267}
8268
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008269static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008270charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008271{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008272 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8273 /* exponentially overallocate to minimize reallocations */
8274 if (requiredsize < 2*outsize)
8275 requiredsize = 2*outsize;
8276 if (_PyBytes_Resize(outobj, requiredsize))
8277 return -1;
8278 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008279}
8280
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* the lookup or the buffer resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008284/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008285 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008286 space is available. Return a new reference to the object that
8287 was put in the output buffer, or Py_None, if the mapping was undefined
8288 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008289 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008290static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008291charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008292 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008293{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008294 PyObject *rep;
8295 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008296 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008297
Christian Heimes90aa7642007-12-19 02:45:37 +00008298 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008299 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008300 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008301 if (res == -1)
8302 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008303 if (outsize<requiredsize)
8304 if (charmapencode_resize(outobj, outpos, requiredsize))
8305 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008306 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008307 outstart[(*outpos)++] = (char)res;
8308 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008309 }
8310
8311 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008312 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008313 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008314 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008315 Py_DECREF(rep);
8316 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008317 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008318 if (PyLong_Check(rep)) {
8319 Py_ssize_t requiredsize = *outpos+1;
8320 if (outsize<requiredsize)
8321 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8322 Py_DECREF(rep);
8323 return enc_EXCEPTION;
8324 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008325 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008326 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008327 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008328 else {
8329 const char *repchars = PyBytes_AS_STRING(rep);
8330 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8331 Py_ssize_t requiredsize = *outpos+repsize;
8332 if (outsize<requiredsize)
8333 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8334 Py_DECREF(rep);
8335 return enc_EXCEPTION;
8336 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008337 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008338 memcpy(outstart + *outpos, repchars, repsize);
8339 *outpos += repsize;
8340 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008341 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008342 Py_DECREF(rep);
8343 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008344}
8345
8346/* handle an error in PyUnicode_EncodeCharmap
8347 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008348static int
8349charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008350 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008351 PyObject **exceptionObject,
Victor Stinner50149202015-09-22 00:26:54 +02008352 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008353 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008354{
8355 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008356 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008357 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008358 enum PyUnicode_Kind kind;
8359 void *data;
8360 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008361 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008362 Py_ssize_t collstartpos = *inpos;
8363 Py_ssize_t collendpos = *inpos+1;
8364 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008365 char *encoding = "charmap";
8366 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008367 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008368 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008369 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008370
Benjamin Petersonbac79492012-01-14 13:34:47 -05008371 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008372 return -1;
8373 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008374 /* find all unencodable characters */
8375 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008376 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008377 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008378 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008379 val = encoding_map_lookup(ch, mapping);
8380 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008381 break;
8382 ++collendpos;
8383 continue;
8384 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008385
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008386 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8387 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008388 if (rep==NULL)
8389 return -1;
8390 else if (rep!=Py_None) {
8391 Py_DECREF(rep);
8392 break;
8393 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008394 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008395 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008396 }
8397 /* cache callback name lookup
8398 * (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02008399 if (*error_handler == _Py_ERROR_UNKNOWN)
8400 *error_handler = get_error_handler(errors);
8401
8402 switch (*error_handler) {
8403 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008404 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008405 return -1;
Victor Stinner50149202015-09-22 00:26:54 +02008406
8407 case _Py_ERROR_REPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008408 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008409 x = charmapencode_output('?', mapping, res, respos);
8410 if (x==enc_EXCEPTION) {
8411 return -1;
8412 }
8413 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008414 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008415 return -1;
8416 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008417 }
8418 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02008419 case _Py_ERROR_IGNORE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008420 *inpos = collendpos;
8421 break;
Victor Stinner50149202015-09-22 00:26:54 +02008422
8423 case _Py_ERROR_XMLCHARREFREPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008424 /* generate replacement (temporarily (mis)uses p) */
8425 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008426 char buffer[2+29+1+1];
8427 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008428 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008429 for (cp = buffer; *cp; ++cp) {
8430 x = charmapencode_output(*cp, mapping, res, respos);
8431 if (x==enc_EXCEPTION)
8432 return -1;
8433 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008434 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008435 return -1;
8436 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008437 }
8438 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008439 *inpos = collendpos;
8440 break;
Victor Stinner50149202015-09-22 00:26:54 +02008441
Benjamin Peterson14339b62009-01-31 16:36:08 +00008442 default:
Victor Stinner50149202015-09-22 00:26:54 +02008443 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008444 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008445 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008446 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008447 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008448 if (PyBytes_Check(repunicode)) {
8449 /* Directly copy bytes result to output. */
8450 Py_ssize_t outsize = PyBytes_Size(*res);
8451 Py_ssize_t requiredsize;
8452 repsize = PyBytes_Size(repunicode);
8453 requiredsize = *respos + repsize;
8454 if (requiredsize > outsize)
8455 /* Make room for all additional bytes. */
8456 if (charmapencode_resize(res, respos, requiredsize)) {
8457 Py_DECREF(repunicode);
8458 return -1;
8459 }
8460 memcpy(PyBytes_AsString(*res) + *respos,
8461 PyBytes_AsString(repunicode), repsize);
8462 *respos += repsize;
8463 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008464 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008465 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008466 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008467 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008468 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008469 Py_DECREF(repunicode);
8470 return -1;
8471 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008472 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008473 data = PyUnicode_DATA(repunicode);
8474 kind = PyUnicode_KIND(repunicode);
8475 for (index = 0; index < repsize; index++) {
8476 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8477 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008478 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008479 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008480 return -1;
8481 }
8482 else if (x==enc_FAILED) {
8483 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008484 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008485 return -1;
8486 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008487 }
8488 *inpos = newpos;
8489 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008490 }
8491 return 0;
8492}
8493
Alexander Belopolsky40018472011-02-26 01:02:56 +00008494PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008495_PyUnicode_EncodeCharmap(PyObject *unicode,
8496 PyObject *mapping,
8497 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008498{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008499 /* output object */
8500 PyObject *res = NULL;
8501 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008502 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008503 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008504 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008505 Py_ssize_t respos = 0;
Victor Stinner50149202015-09-22 00:26:54 +02008506 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008507 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02008508 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008509 void *data;
8510 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008511
Benjamin Petersonbac79492012-01-14 13:34:47 -05008512 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008513 return NULL;
8514 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008515 data = PyUnicode_DATA(unicode);
8516 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008517
Guido van Rossumd57fd912000-03-10 22:53:23 +00008518 /* Default to Latin-1 */
8519 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008520 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008521
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008522 /* allocate enough for a simple encoding without
8523 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008524 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008525 if (res == NULL)
8526 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008527 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008528 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008529
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008530 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008531 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008532 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008533 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008534 if (x==enc_EXCEPTION) /* error */
8535 goto onError;
8536 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008537 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008538 &exc,
Victor Stinner50149202015-09-22 00:26:54 +02008539 &error_handler, &error_handler_obj, errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00008540 &res, &respos)) {
8541 goto onError;
8542 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008543 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008544 else
8545 /* done with this character => adjust input position */
8546 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008547 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008548
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008549 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008550 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008551 if (_PyBytes_Resize(&res, respos) < 0)
8552 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008553
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008554 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008555 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008556 return res;
8557
Benjamin Peterson29060642009-01-31 22:14:21 +00008558 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008559 Py_XDECREF(res);
8560 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008561 Py_XDECREF(error_handler_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008562 return NULL;
8563}
8564
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008565/* Deprecated */
8566PyObject *
8567PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8568 Py_ssize_t size,
8569 PyObject *mapping,
8570 const char *errors)
8571{
8572 PyObject *result;
8573 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8574 if (unicode == NULL)
8575 return NULL;
8576 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8577 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008578 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008579}
8580
Alexander Belopolsky40018472011-02-26 01:02:56 +00008581PyObject *
8582PyUnicode_AsCharmapString(PyObject *unicode,
8583 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008584{
8585 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008586 PyErr_BadArgument();
8587 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008588 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008589 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008590}
8591
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008592/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008593static void
8594make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008595 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008596 Py_ssize_t startpos, Py_ssize_t endpos,
8597 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008598{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008599 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008600 *exceptionObject = _PyUnicodeTranslateError_Create(
8601 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008602 }
8603 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008604 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8605 goto onError;
8606 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8607 goto onError;
8608 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8609 goto onError;
8610 return;
8611 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008612 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008613 }
8614}
8615
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008616/* error handling callback helper:
8617 build arguments, call the callback and check the arguments,
8618 put the result into newpos and return the replacement string, which
8619 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008620static PyObject *
8621unicode_translate_call_errorhandler(const char *errors,
8622 PyObject **errorHandler,
8623 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008624 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008625 Py_ssize_t startpos, Py_ssize_t endpos,
8626 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008627{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02008628 static const char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008629
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008630 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008631 PyObject *restuple;
8632 PyObject *resunicode;
8633
8634 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008635 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008636 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008637 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008638 }
8639
8640 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008641 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008642 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008643 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008644
8645 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008646 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008647 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008648 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008649 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008650 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008651 Py_DECREF(restuple);
8652 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008653 }
8654 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008655 &resunicode, &i_newpos)) {
8656 Py_DECREF(restuple);
8657 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008658 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008659 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008660 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008661 else
8662 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008663 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02008664 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008665 Py_DECREF(restuple);
8666 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008667 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008668 Py_INCREF(resunicode);
8669 Py_DECREF(restuple);
8670 return resunicode;
8671}
8672
8673/* Lookup the character ch in the mapping and put the result in result,
8674 which must be decrefed by the caller.
8675 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008676static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008677charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008678{
Christian Heimes217cfd12007-12-02 14:31:20 +00008679 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008680 PyObject *x;
8681
8682 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008683 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008684 x = PyObject_GetItem(mapping, w);
8685 Py_DECREF(w);
8686 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008687 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8688 /* No mapping found means: use 1:1 mapping. */
8689 PyErr_Clear();
8690 *result = NULL;
8691 return 0;
8692 } else
8693 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008694 }
8695 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008696 *result = x;
8697 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008698 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008699 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008700 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008701 if (value < 0 || value > MAX_UNICODE) {
8702 PyErr_Format(PyExc_ValueError,
8703 "character mapping must be in range(0x%x)",
8704 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008705 Py_DECREF(x);
8706 return -1;
8707 }
8708 *result = x;
8709 return 0;
8710 }
8711 else if (PyUnicode_Check(x)) {
8712 *result = x;
8713 return 0;
8714 }
8715 else {
8716 /* wrong return value */
8717 PyErr_SetString(PyExc_TypeError,
8718 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008719 Py_DECREF(x);
8720 return -1;
8721 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008722}
Victor Stinner1194ea02014-04-04 19:37:40 +02008723
8724/* lookup the character, write the result into the writer.
8725 Return 1 if the result was written into the writer, return 0 if the mapping
8726 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008727static int
Victor Stinner1194ea02014-04-04 19:37:40 +02008728charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8729 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008730{
Victor Stinner1194ea02014-04-04 19:37:40 +02008731 PyObject *item;
8732
8733 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00008734 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008735
8736 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008737 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02008738 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008739 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008740 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008741 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008742 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008743
8744 if (item == Py_None) {
8745 Py_DECREF(item);
8746 return 0;
8747 }
8748
8749 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02008750 long ch = (Py_UCS4)PyLong_AS_LONG(item);
8751 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8752 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008753 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8754 Py_DECREF(item);
8755 return -1;
8756 }
8757 Py_DECREF(item);
8758 return 1;
8759 }
8760
8761 if (!PyUnicode_Check(item)) {
8762 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008763 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008764 }
8765
8766 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8767 Py_DECREF(item);
8768 return -1;
8769 }
8770
8771 Py_DECREF(item);
8772 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008773}
8774
Victor Stinner89a76ab2014-04-05 11:44:04 +02008775static int
8776unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8777 Py_UCS1 *translate)
8778{
Benjamin Peterson1365de72014-04-07 20:15:41 -04008779 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008780 int ret = 0;
8781
Victor Stinner89a76ab2014-04-05 11:44:04 +02008782 if (charmaptranslate_lookup(ch, mapping, &item)) {
8783 return -1;
8784 }
8785
8786 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008787 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02008788 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008789 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008790 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008791 /* not found => default to 1:1 mapping */
8792 translate[ch] = ch;
8793 return 1;
8794 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008795 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02008796 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008797 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8798 used it */
8799 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008800 /* invalid character or character outside ASCII:
8801 skip the fast translate */
8802 goto exit;
8803 }
8804 translate[ch] = (Py_UCS1)replace;
8805 }
8806 else if (PyUnicode_Check(item)) {
8807 Py_UCS4 replace;
8808
8809 if (PyUnicode_READY(item) == -1) {
8810 Py_DECREF(item);
8811 return -1;
8812 }
8813 if (PyUnicode_GET_LENGTH(item) != 1)
8814 goto exit;
8815
8816 replace = PyUnicode_READ_CHAR(item, 0);
8817 if (replace > 127)
8818 goto exit;
8819 translate[ch] = (Py_UCS1)replace;
8820 }
8821 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008822 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02008823 goto exit;
8824 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008825 ret = 1;
8826
Benjamin Peterson1365de72014-04-07 20:15:41 -04008827 exit:
8828 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008829 return ret;
8830}
8831
8832/* Fast path for ascii => ascii translation. Return 1 if the whole string
8833 was translated into writer, return 0 if the input string was partially
8834 translated into writer, raise an exception and return -1 on error. */
8835static int
8836unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008837 _PyUnicodeWriter *writer, int ignore,
8838 Py_ssize_t *input_pos)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008839{
Victor Stinner872b2912014-04-05 14:27:07 +02008840 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008841 Py_ssize_t len;
8842 Py_UCS1 *in, *end, *out;
Victor Stinner872b2912014-04-05 14:27:07 +02008843 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008844
Victor Stinner89a76ab2014-04-05 11:44:04 +02008845 len = PyUnicode_GET_LENGTH(input);
8846
Victor Stinner872b2912014-04-05 14:27:07 +02008847 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008848
8849 in = PyUnicode_1BYTE_DATA(input);
8850 end = in + len;
8851
8852 assert(PyUnicode_IS_ASCII(writer->buffer));
8853 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
8854 out = PyUnicode_1BYTE_DATA(writer->buffer);
8855
Victor Stinner872b2912014-04-05 14:27:07 +02008856 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008857 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02008858 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008859 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02008860 int translate = unicode_fast_translate_lookup(mapping, ch,
8861 ascii_table);
8862 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008863 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02008864 if (translate == 0)
8865 goto exit;
8866 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008867 }
Victor Stinner872b2912014-04-05 14:27:07 +02008868 if (ch2 == 0xfe) {
8869 if (ignore)
8870 continue;
8871 goto exit;
8872 }
8873 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008874 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02008875 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008876 }
Victor Stinner872b2912014-04-05 14:27:07 +02008877 res = 1;
8878
8879exit:
8880 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008881 *input_pos = in - PyUnicode_1BYTE_DATA(input);
Victor Stinner872b2912014-04-05 14:27:07 +02008882 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008883}
8884
Victor Stinner3222da22015-10-01 22:07:32 +02008885static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008886_PyUnicode_TranslateCharmap(PyObject *input,
8887 PyObject *mapping,
8888 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008889{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008890 /* input object */
Victor Stinner1194ea02014-04-04 19:37:40 +02008891 char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008892 Py_ssize_t size, i;
8893 int kind;
8894 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02008895 _PyUnicodeWriter writer;
8896 /* error handler */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008897 char *reason = "character maps to <undefined>";
8898 PyObject *errorHandler = NULL;
8899 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008900 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008901 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008902
Guido van Rossumd57fd912000-03-10 22:53:23 +00008903 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008904 PyErr_BadArgument();
8905 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008906 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008907
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008908 if (PyUnicode_READY(input) == -1)
8909 return NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008910 data = (char*)PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008911 kind = PyUnicode_KIND(input);
8912 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008913
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03008914 if (size == 0)
8915 return PyUnicode_FromObject(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008916
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008917 /* allocate enough for a simple 1:1 translation without
8918 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02008919 _PyUnicodeWriter_Init(&writer);
8920 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008921 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008922
Victor Stinner872b2912014-04-05 14:27:07 +02008923 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
8924
Victor Stinner33798672016-03-01 21:59:58 +01008925 if (PyUnicode_READY(input) == -1)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008926 return NULL;
Victor Stinner33798672016-03-01 21:59:58 +01008927 if (PyUnicode_IS_ASCII(input)) {
8928 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
8929 if (res < 0) {
8930 _PyUnicodeWriter_Dealloc(&writer);
8931 return NULL;
8932 }
8933 if (res == 1)
8934 return _PyUnicodeWriter_Finish(&writer);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008935 }
Victor Stinner33798672016-03-01 21:59:58 +01008936 else {
8937 i = 0;
8938 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008939
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008940 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008941 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008942 int translate;
8943 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8944 Py_ssize_t newpos;
8945 /* startpos for collecting untranslatable chars */
8946 Py_ssize_t collstart;
8947 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02008948 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008949
Victor Stinner1194ea02014-04-04 19:37:40 +02008950 ch = PyUnicode_READ(kind, data, i);
8951 translate = charmaptranslate_output(ch, mapping, &writer);
8952 if (translate < 0)
8953 goto onError;
8954
8955 if (translate != 0) {
8956 /* it worked => adjust input pointer */
8957 ++i;
8958 continue;
8959 }
8960
8961 /* untranslatable character */
8962 collstart = i;
8963 collend = i+1;
8964
8965 /* find all untranslatable characters */
8966 while (collend < size) {
8967 PyObject *x;
8968 ch = PyUnicode_READ(kind, data, collend);
8969 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008970 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02008971 Py_XDECREF(x);
8972 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008973 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02008974 ++collend;
8975 }
8976
8977 if (ignore) {
8978 i = collend;
8979 }
8980 else {
8981 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
8982 reason, input, &exc,
8983 collstart, collend, &newpos);
8984 if (repunicode == NULL)
8985 goto onError;
8986 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008987 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02008988 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008989 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008990 Py_DECREF(repunicode);
8991 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008992 }
8993 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008994 Py_XDECREF(exc);
8995 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02008996 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008997
Benjamin Peterson29060642009-01-31 22:14:21 +00008998 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02008999 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009000 Py_XDECREF(exc);
9001 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009002 return NULL;
9003}
9004
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009005/* Deprecated. Use PyUnicode_Translate instead. */
9006PyObject *
9007PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9008 Py_ssize_t size,
9009 PyObject *mapping,
9010 const char *errors)
9011{
Christian Heimes5f520f42012-09-11 14:03:25 +02009012 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009013 PyObject *unicode = PyUnicode_FromUnicode(p, size);
9014 if (!unicode)
9015 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02009016 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9017 Py_DECREF(unicode);
9018 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009019}
9020
Alexander Belopolsky40018472011-02-26 01:02:56 +00009021PyObject *
9022PyUnicode_Translate(PyObject *str,
9023 PyObject *mapping,
9024 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009025{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009026 if (ensure_unicode(str) < 0)
Christian Heimes5f520f42012-09-11 14:03:25 +02009027 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009028 return _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009029}
Tim Petersced69f82003-09-16 20:30:58 +00009030
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009031static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009032fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009033{
9034 /* No need to call PyUnicode_READY(self) because this function is only
9035 called as a callback from fixup() which does it already. */
9036 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9037 const int kind = PyUnicode_KIND(self);
9038 void *data = PyUnicode_DATA(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02009039 Py_UCS4 maxchar = 127, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009040 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009041 Py_ssize_t i;
9042
9043 for (i = 0; i < len; ++i) {
9044 ch = PyUnicode_READ(kind, data, i);
9045 fixed = 0;
9046 if (ch > 127) {
9047 if (Py_UNICODE_ISSPACE(ch))
9048 fixed = ' ';
9049 else {
9050 const int decimal = Py_UNICODE_TODECIMAL(ch);
9051 if (decimal >= 0)
9052 fixed = '0' + decimal;
9053 }
9054 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009055 modified = 1;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009056 maxchar = Py_MAX(maxchar, fixed);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009057 PyUnicode_WRITE(kind, data, i, fixed);
9058 }
Victor Stinnere6abb482012-05-02 01:15:40 +02009059 else
Benjamin Peterson7e303732013-06-10 09:19:46 -07009060 maxchar = Py_MAX(maxchar, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009061 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009062 }
9063
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009064 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009065}
9066
9067PyObject *
9068_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9069{
9070 if (!PyUnicode_Check(unicode)) {
9071 PyErr_BadInternalCall();
9072 return NULL;
9073 }
9074 if (PyUnicode_READY(unicode) == -1)
9075 return NULL;
9076 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
9077 /* If the string is already ASCII, just return the same string */
9078 Py_INCREF(unicode);
9079 return unicode;
9080 }
Victor Stinner9310abb2011-10-05 00:59:23 +02009081 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009082}
9083
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009084PyObject *
9085PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9086 Py_ssize_t length)
9087{
Victor Stinnerf0124502011-11-21 23:12:56 +01009088 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009089 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009090 Py_UCS4 maxchar;
9091 enum PyUnicode_Kind kind;
9092 void *data;
9093
Victor Stinner99d7ad02012-02-22 13:37:39 +01009094 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009095 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009096 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009097 if (ch > 127) {
9098 int decimal = Py_UNICODE_TODECIMAL(ch);
9099 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009100 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009101 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009102 }
9103 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009104
9105 /* Copy to a new string */
9106 decimal = PyUnicode_New(length, maxchar);
9107 if (decimal == NULL)
9108 return decimal;
9109 kind = PyUnicode_KIND(decimal);
9110 data = PyUnicode_DATA(decimal);
9111 /* Iterate over code points */
9112 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009113 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01009114 if (ch > 127) {
9115 int decimal = Py_UNICODE_TODECIMAL(ch);
9116 if (decimal >= 0)
9117 ch = '0' + decimal;
9118 }
9119 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009120 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009121 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009122}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009123/* --- Decimal Encoder ---------------------------------------------------- */
9124
Alexander Belopolsky40018472011-02-26 01:02:56 +00009125int
9126PyUnicode_EncodeDecimal(Py_UNICODE *s,
9127 Py_ssize_t length,
9128 char *output,
9129 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009130{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009131 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009132 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009133 enum PyUnicode_Kind kind;
9134 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009135
9136 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009137 PyErr_BadArgument();
9138 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009139 }
9140
Victor Stinner42bf7752011-11-21 22:52:58 +01009141 unicode = PyUnicode_FromUnicode(s, length);
9142 if (unicode == NULL)
9143 return -1;
9144
Benjamin Petersonbac79492012-01-14 13:34:47 -05009145 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner6345be92011-11-25 20:09:01 +01009146 Py_DECREF(unicode);
9147 return -1;
9148 }
Victor Stinner42bf7752011-11-21 22:52:58 +01009149 kind = PyUnicode_KIND(unicode);
9150 data = PyUnicode_DATA(unicode);
9151
Victor Stinnerb84d7232011-11-22 01:50:07 +01009152 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009153 PyObject *exc;
9154 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009155 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009156 Py_ssize_t startpos;
9157
9158 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009159
Benjamin Peterson29060642009-01-31 22:14:21 +00009160 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009161 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009162 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009163 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009164 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009165 decimal = Py_UNICODE_TODECIMAL(ch);
9166 if (decimal >= 0) {
9167 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009168 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009169 continue;
9170 }
9171 if (0 < ch && ch < 256) {
9172 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009173 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009174 continue;
9175 }
Victor Stinner6345be92011-11-25 20:09:01 +01009176
Victor Stinner42bf7752011-11-21 22:52:58 +01009177 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009178 exc = NULL;
9179 raise_encode_exception(&exc, "decimal", unicode,
9180 startpos, startpos+1,
9181 "invalid decimal Unicode string");
9182 Py_XDECREF(exc);
9183 Py_DECREF(unicode);
9184 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009185 }
9186 /* 0-terminate the output string */
9187 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009188 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009189 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009190}
9191
Guido van Rossumd57fd912000-03-10 22:53:23 +00009192/* --- Helpers ------------------------------------------------------------ */
9193
/* Helper macro to fix up start/end slice values against a length `len`.

   `end` is clamped to `len` when larger; a negative `end` or `start` is
   shifted by `len` and then clamped to 0 if still negative.  `start` and
   `end` must be modifiable lvalues.  Every argument is evaluated more than
   once, so none of them may have side effects.

   The body is wrapped in do { ... } while (0) so that the macro expands to
   exactly one statement and is safe inside an unbraced if/else. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
9208
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009209static Py_ssize_t
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009210any_find_slice(PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009211 Py_ssize_t start,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009212 Py_ssize_t end,
9213 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009214{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009215 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009216 void *buf1, *buf2;
9217 Py_ssize_t len1, len2, result;
9218
9219 kind1 = PyUnicode_KIND(s1);
9220 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009221 if (kind1 < kind2)
9222 return -1;
9223
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009224 len1 = PyUnicode_GET_LENGTH(s1);
9225 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009226 ADJUST_INDICES(start, end, len1);
9227 if (end - start < len2)
9228 return -1;
9229
9230 buf1 = PyUnicode_DATA(s1);
9231 buf2 = PyUnicode_DATA(s2);
9232 if (len2 == 1) {
9233 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9234 result = findchar((const char *)buf1 + kind1*start,
9235 kind1, end - start, ch, direction);
9236 if (result == -1)
9237 return -1;
9238 else
9239 return start + result;
9240 }
9241
9242 if (kind2 != kind1) {
9243 buf2 = _PyUnicode_AsKind(s2, kind1);
9244 if (!buf2)
9245 return -2;
9246 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009247
Victor Stinner794d5672011-10-10 03:21:36 +02009248 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009249 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009250 case PyUnicode_1BYTE_KIND:
9251 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9252 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9253 else
9254 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9255 break;
9256 case PyUnicode_2BYTE_KIND:
9257 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9258 break;
9259 case PyUnicode_4BYTE_KIND:
9260 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9261 break;
9262 default:
9263 assert(0); result = -2;
9264 }
9265 }
9266 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009267 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009268 case PyUnicode_1BYTE_KIND:
9269 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9270 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9271 else
9272 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9273 break;
9274 case PyUnicode_2BYTE_KIND:
9275 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9276 break;
9277 case PyUnicode_4BYTE_KIND:
9278 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9279 break;
9280 default:
9281 assert(0); result = -2;
9282 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009283 }
9284
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009285 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009286 PyMem_Free(buf2);
9287
9288 return result;
9289}
9290
9291Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009292_PyUnicode_InsertThousandsGrouping(
9293 PyObject *unicode, Py_ssize_t index,
9294 Py_ssize_t n_buffer,
9295 void *digits, Py_ssize_t n_digits,
9296 Py_ssize_t min_width,
9297 const char *grouping, PyObject *thousands_sep,
9298 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009299{
Victor Stinner41a863c2012-02-24 00:37:51 +01009300 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009301 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01009302 Py_ssize_t thousands_sep_len;
9303 Py_ssize_t len;
9304
9305 if (unicode != NULL) {
9306 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009307 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01009308 }
9309 else {
9310 kind = PyUnicode_1BYTE_KIND;
9311 data = NULL;
9312 }
9313 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
9314 thousands_sep_data = PyUnicode_DATA(thousands_sep);
9315 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9316 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01009317 if (thousands_sep_kind < kind) {
9318 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
9319 if (!thousands_sep_data)
9320 return -1;
9321 }
9322 else {
9323 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
9324 if (!data)
9325 return -1;
9326 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009327 }
9328
Benjamin Petersonead6b532011-12-20 17:23:42 -06009329 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009330 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009331 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01009332 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009333 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009334 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009335 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009336 else
Victor Stinner41a863c2012-02-24 00:37:51 +01009337 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02009338 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009339 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009340 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009341 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009342 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009343 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009344 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009345 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009346 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009347 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009348 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009349 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009350 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009351 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009352 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009353 break;
9354 default:
9355 assert(0);
9356 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009357 }
Victor Stinner90f50d42012-02-24 01:44:47 +01009358 if (unicode != NULL && thousands_sep_kind != kind) {
9359 if (thousands_sep_kind < kind)
9360 PyMem_Free(thousands_sep_data);
9361 else
9362 PyMem_Free(data);
9363 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009364 if (unicode == NULL) {
9365 *maxchar = 127;
9366 if (len != n_digits) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009367 *maxchar = Py_MAX(*maxchar,
Victor Stinnere6abb482012-05-02 01:15:40 +02009368 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
Victor Stinner41a863c2012-02-24 00:37:51 +01009369 }
9370 }
9371 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009372}
9373
9374
Alexander Belopolsky40018472011-02-26 01:02:56 +00009375Py_ssize_t
9376PyUnicode_Count(PyObject *str,
9377 PyObject *substr,
9378 Py_ssize_t start,
9379 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009380{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009381 Py_ssize_t result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009382 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009383 void *buf1 = NULL, *buf2 = NULL;
9384 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009385
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009386 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009387 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009388
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009389 kind1 = PyUnicode_KIND(str);
9390 kind2 = PyUnicode_KIND(substr);
9391 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009392 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009393
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009394 len1 = PyUnicode_GET_LENGTH(str);
9395 len2 = PyUnicode_GET_LENGTH(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009396 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009397 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009398 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009399
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009400 buf1 = PyUnicode_DATA(str);
9401 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009402 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009403 buf2 = _PyUnicode_AsKind(substr, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009404 if (!buf2)
9405 goto onError;
9406 }
9407
9408 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009409 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009410 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
Victor Stinnerc3cec782011-10-05 21:24:08 +02009411 result = asciilib_count(
9412 ((Py_UCS1*)buf1) + start, end - start,
9413 buf2, len2, PY_SSIZE_T_MAX
9414 );
9415 else
9416 result = ucs1lib_count(
9417 ((Py_UCS1*)buf1) + start, end - start,
9418 buf2, len2, PY_SSIZE_T_MAX
9419 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009420 break;
9421 case PyUnicode_2BYTE_KIND:
9422 result = ucs2lib_count(
9423 ((Py_UCS2*)buf1) + start, end - start,
9424 buf2, len2, PY_SSIZE_T_MAX
9425 );
9426 break;
9427 case PyUnicode_4BYTE_KIND:
9428 result = ucs4lib_count(
9429 ((Py_UCS4*)buf1) + start, end - start,
9430 buf2, len2, PY_SSIZE_T_MAX
9431 );
9432 break;
9433 default:
9434 assert(0); result = 0;
9435 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009436
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009437 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009438 PyMem_Free(buf2);
9439
Guido van Rossumd57fd912000-03-10 22:53:23 +00009440 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009441 onError:
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009442 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009443 PyMem_Free(buf2);
9444 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009445}
9446
Alexander Belopolsky40018472011-02-26 01:02:56 +00009447Py_ssize_t
9448PyUnicode_Find(PyObject *str,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009449 PyObject *substr,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009450 Py_ssize_t start,
9451 Py_ssize_t end,
9452 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009453{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009454 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009455 return -2;
Tim Petersced69f82003-09-16 20:30:58 +00009456
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009457 return any_find_slice(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009458}
9459
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009460Py_ssize_t
9461PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9462 Py_ssize_t start, Py_ssize_t end,
9463 int direction)
9464{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009465 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009466 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009467 if (PyUnicode_READY(str) == -1)
9468 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009469 if (start < 0 || end < 0) {
9470 PyErr_SetString(PyExc_IndexError, "string index out of range");
9471 return -2;
9472 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009473 if (end > PyUnicode_GET_LENGTH(str))
9474 end = PyUnicode_GET_LENGTH(str);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009475 if (start >= end)
9476 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009477 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009478 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9479 kind, end-start, ch, direction);
9480 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009481 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009482 else
9483 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009484}
9485
Alexander Belopolsky40018472011-02-26 01:02:56 +00009486static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009487tailmatch(PyObject *self,
9488 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009489 Py_ssize_t start,
9490 Py_ssize_t end,
9491 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009492{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009493 int kind_self;
9494 int kind_sub;
9495 void *data_self;
9496 void *data_sub;
9497 Py_ssize_t offset;
9498 Py_ssize_t i;
9499 Py_ssize_t end_sub;
9500
9501 if (PyUnicode_READY(self) == -1 ||
9502 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009503 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009504
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009505 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9506 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009507 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009508 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009509
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009510 if (PyUnicode_GET_LENGTH(substring) == 0)
9511 return 1;
9512
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009513 kind_self = PyUnicode_KIND(self);
9514 data_self = PyUnicode_DATA(self);
9515 kind_sub = PyUnicode_KIND(substring);
9516 data_sub = PyUnicode_DATA(substring);
9517 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9518
9519 if (direction > 0)
9520 offset = end;
9521 else
9522 offset = start;
9523
9524 if (PyUnicode_READ(kind_self, data_self, offset) ==
9525 PyUnicode_READ(kind_sub, data_sub, 0) &&
9526 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9527 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9528 /* If both are of the same kind, memcmp is sufficient */
9529 if (kind_self == kind_sub) {
9530 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009531 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009532 data_sub,
9533 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009534 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009535 }
Martin Pantere26da7c2016-06-02 10:07:09 +00009536 /* otherwise we have to compare each character by first accessing it */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009537 else {
9538 /* We do not need to compare 0 and len(substring)-1 because
9539 the if statement above ensured already that they are equal
9540 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009541 for (i = 1; i < end_sub; ++i) {
9542 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9543 PyUnicode_READ(kind_sub, data_sub, i))
9544 return 0;
9545 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009546 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009547 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009548 }
9549
9550 return 0;
9551}
9552
Alexander Belopolsky40018472011-02-26 01:02:56 +00009553Py_ssize_t
9554PyUnicode_Tailmatch(PyObject *str,
9555 PyObject *substr,
9556 Py_ssize_t start,
9557 Py_ssize_t end,
9558 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009559{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009560 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009561 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009562
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009563 return tailmatch(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009564}
9565
Guido van Rossumd57fd912000-03-10 22:53:23 +00009566/* Apply fixfct filter to the Unicode object self and return a
9567 reference to the modified object */
9568
Alexander Belopolsky40018472011-02-26 01:02:56 +00009569static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009570fixup(PyObject *self,
9571 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009572{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009573 PyObject *u;
9574 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009575 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009576
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009577 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009578 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009579 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009580 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009581
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009582 /* fix functions return the new maximum character in a string,
9583 if the kind of the resulting unicode object does not change,
9584 everything is fine. Otherwise we need to change the string kind
9585 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009586 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009587
9588 if (maxchar_new == 0) {
9589 /* no changes */;
9590 if (PyUnicode_CheckExact(self)) {
9591 Py_DECREF(u);
9592 Py_INCREF(self);
9593 return self;
9594 }
9595 else
9596 return u;
9597 }
9598
Victor Stinnere6abb482012-05-02 01:15:40 +02009599 maxchar_new = align_maxchar(maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009600
Victor Stinnereaab6042011-12-11 22:22:39 +01009601 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009602 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009603
9604 /* In case the maximum character changed, we need to
9605 convert the string to the new category. */
9606 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9607 if (v == NULL) {
9608 Py_DECREF(u);
9609 return NULL;
9610 }
9611 if (maxchar_new > maxchar_old) {
9612 /* If the maxchar increased so that the kind changed, not all
9613 characters are representable anymore and we need to fix the
9614 string again. This only happens in very few cases. */
Victor Stinnerd3f08822012-05-29 12:57:52 +02009615 _PyUnicode_FastCopyCharacters(v, 0,
9616 self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinnereaab6042011-12-11 22:22:39 +01009617 maxchar_old = fixfct(v);
9618 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009619 }
9620 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009621 _PyUnicode_FastCopyCharacters(v, 0,
9622 u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009623 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009624 Py_DECREF(u);
9625 assert(_PyUnicode_CheckConsistency(v, 1));
9626 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009627}
9628
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009629static PyObject *
9630ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009631{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009632 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9633 char *resdata, *data = PyUnicode_DATA(self);
9634 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009635
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009636 res = PyUnicode_New(len, 127);
9637 if (res == NULL)
9638 return NULL;
9639 resdata = PyUnicode_DATA(res);
9640 if (lower)
9641 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009642 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009643 _Py_bytes_upper(resdata, data, len);
9644 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009645}
9646
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009647static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009648handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009649{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009650 Py_ssize_t j;
9651 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01009652 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009653 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009654
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009655 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9656
9657 where ! is a negation and \p{xxx} is a character with property xxx.
9658 */
9659 for (j = i - 1; j >= 0; j--) {
9660 c = PyUnicode_READ(kind, data, j);
9661 if (!_PyUnicode_IsCaseIgnorable(c))
9662 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009663 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009664 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9665 if (final_sigma) {
9666 for (j = i + 1; j < length; j++) {
9667 c = PyUnicode_READ(kind, data, j);
9668 if (!_PyUnicode_IsCaseIgnorable(c))
9669 break;
9670 }
9671 final_sigma = j == length || !_PyUnicode_IsCased(c);
9672 }
9673 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009674}
9675
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009676static int
9677lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9678 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009679{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009680 /* Obscure special case. */
9681 if (c == 0x3A3) {
9682 mapped[0] = handle_capital_sigma(kind, data, length, i);
9683 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009684 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009685 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009686}
9687
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009688static Py_ssize_t
9689do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009690{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009691 Py_ssize_t i, k = 0;
9692 int n_res, j;
9693 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009694
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009695 c = PyUnicode_READ(kind, data, 0);
9696 n_res = _PyUnicode_ToUpperFull(c, mapped);
9697 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009698 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009699 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009700 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009701 for (i = 1; i < length; i++) {
9702 c = PyUnicode_READ(kind, data, i);
9703 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9704 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009705 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009706 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009707 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009708 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009709 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009710}
9711
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009712static Py_ssize_t
9713do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9714 Py_ssize_t i, k = 0;
9715
9716 for (i = 0; i < length; i++) {
9717 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9718 int n_res, j;
9719 if (Py_UNICODE_ISUPPER(c)) {
9720 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9721 }
9722 else if (Py_UNICODE_ISLOWER(c)) {
9723 n_res = _PyUnicode_ToUpperFull(c, mapped);
9724 }
9725 else {
9726 n_res = 1;
9727 mapped[0] = c;
9728 }
9729 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009730 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009731 res[k++] = mapped[j];
9732 }
9733 }
9734 return k;
9735}
9736
9737static Py_ssize_t
9738do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9739 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009740{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009741 Py_ssize_t i, k = 0;
9742
9743 for (i = 0; i < length; i++) {
9744 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9745 int n_res, j;
9746 if (lower)
9747 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9748 else
9749 n_res = _PyUnicode_ToUpperFull(c, mapped);
9750 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009751 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009752 res[k++] = mapped[j];
9753 }
9754 }
9755 return k;
9756}
9757
9758static Py_ssize_t
9759do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9760{
9761 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9762}
9763
9764static Py_ssize_t
9765do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9766{
9767 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9768}
9769
Benjamin Petersone51757f2012-01-12 21:10:29 -05009770static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009771do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9772{
9773 Py_ssize_t i, k = 0;
9774
9775 for (i = 0; i < length; i++) {
9776 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9777 Py_UCS4 mapped[3];
9778 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9779 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009780 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009781 res[k++] = mapped[j];
9782 }
9783 }
9784 return k;
9785}
9786
9787static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009788do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9789{
9790 Py_ssize_t i, k = 0;
9791 int previous_is_cased;
9792
9793 previous_is_cased = 0;
9794 for (i = 0; i < length; i++) {
9795 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9796 Py_UCS4 mapped[3];
9797 int n_res, j;
9798
9799 if (previous_is_cased)
9800 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9801 else
9802 n_res = _PyUnicode_ToTitleFull(c, mapped);
9803
9804 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009805 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009806 res[k++] = mapped[j];
9807 }
9808
9809 previous_is_cased = _PyUnicode_IsCased(c);
9810 }
9811 return k;
9812}
9813
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009814static PyObject *
9815case_operation(PyObject *self,
9816 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9817{
9818 PyObject *res = NULL;
9819 Py_ssize_t length, newlength = 0;
9820 int kind, outkind;
9821 void *data, *outdata;
9822 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9823
Benjamin Petersoneea48462012-01-16 14:28:50 -05009824 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009825
9826 kind = PyUnicode_KIND(self);
9827 data = PyUnicode_DATA(self);
9828 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +02009829 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -04009830 PyErr_SetString(PyExc_OverflowError, "string is too long");
9831 return NULL;
9832 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -04009833 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009834 if (tmp == NULL)
9835 return PyErr_NoMemory();
9836 newlength = perform(kind, data, length, tmp, &maxchar);
9837 res = PyUnicode_New(newlength, maxchar);
9838 if (res == NULL)
9839 goto leave;
9840 tmpend = tmp + newlength;
9841 outdata = PyUnicode_DATA(res);
9842 outkind = PyUnicode_KIND(res);
9843 switch (outkind) {
9844 case PyUnicode_1BYTE_KIND:
9845 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9846 break;
9847 case PyUnicode_2BYTE_KIND:
9848 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9849 break;
9850 case PyUnicode_4BYTE_KIND:
9851 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9852 break;
9853 default:
9854 assert(0);
9855 break;
9856 }
9857 leave:
9858 PyMem_FREE(tmp);
9859 return res;
9860}
9861
Tim Peters8ce9f162004-08-27 01:49:32 +00009862PyObject *
9863PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009864{
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009865 PyObject *res;
9866 PyObject *fseq;
9867 Py_ssize_t seqlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009868 PyObject **items;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009869
Benjamin Peterson9743b2c2014-02-15 13:02:52 -05009870 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +00009871 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009872 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009873 }
9874
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009875 /* NOTE: the following code can't call back into Python code,
9876 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009877 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009878
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009879 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00009880 seqlen = PySequence_Fast_GET_SIZE(fseq);
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009881 res = _PyUnicode_JoinArray(separator, items, seqlen);
9882 Py_DECREF(fseq);
9883 return res;
9884}
9885
9886PyObject *
9887_PyUnicode_JoinArray(PyObject *separator, PyObject **items, Py_ssize_t seqlen)
9888{
9889 PyObject *res = NULL; /* the result */
9890 PyObject *sep = NULL;
9891 Py_ssize_t seplen;
9892 PyObject *item;
9893 Py_ssize_t sz, i, res_offset;
9894 Py_UCS4 maxchar;
9895 Py_UCS4 item_maxchar;
9896 int use_memcpy;
9897 unsigned char *res_data = NULL, *sep_data = NULL;
9898 PyObject *last_obj;
9899 unsigned int kind = 0;
9900
Tim Peters05eba1f2004-08-27 21:32:02 +00009901 /* If empty sequence, return u"". */
9902 if (seqlen == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02009903 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +00009904 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009905
Tim Peters05eba1f2004-08-27 21:32:02 +00009906 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009907 last_obj = NULL;
Victor Stinneracf47b82011-10-06 12:32:37 +02009908 if (seqlen == 1) {
9909 if (PyUnicode_CheckExact(items[0])) {
9910 res = items[0];
9911 Py_INCREF(res);
Victor Stinneracf47b82011-10-06 12:32:37 +02009912 return res;
9913 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009914 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009915 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009916 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009917 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009918 /* Set up sep and seplen */
9919 if (separator == NULL) {
9920 /* fall back to a blank space separator */
9921 sep = PyUnicode_FromOrdinal(' ');
9922 if (!sep)
9923 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009924 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009925 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009926 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009927 else {
9928 if (!PyUnicode_Check(separator)) {
9929 PyErr_Format(PyExc_TypeError,
9930 "separator: expected str instance,"
9931 " %.80s found",
9932 Py_TYPE(separator)->tp_name);
9933 goto onError;
9934 }
9935 if (PyUnicode_READY(separator))
9936 goto onError;
9937 sep = separator;
9938 seplen = PyUnicode_GET_LENGTH(separator);
9939 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9940 /* inc refcount to keep this code path symmetric with the
9941 above case of a blank separator */
9942 Py_INCREF(sep);
9943 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009944 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009945 }
9946
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009947 /* There are at least two things to join, or else we have a subclass
9948 * of str in the sequence.
9949 * Do a pre-pass to figure out the total amount of space we'll
9950 * need (sz), and see whether all argument are strings.
9951 */
9952 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009953#ifdef Py_DEBUG
9954 use_memcpy = 0;
9955#else
9956 use_memcpy = 1;
9957#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009958 for (i = 0; i < seqlen; i++) {
9959 const Py_ssize_t old_sz = sz;
9960 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009961 if (!PyUnicode_Check(item)) {
9962 PyErr_Format(PyExc_TypeError,
Victor Stinnera33bce02014-07-04 22:47:46 +02009963 "sequence item %zd: expected str instance,"
Benjamin Peterson29060642009-01-31 22:14:21 +00009964 " %.80s found",
9965 i, Py_TYPE(item)->tp_name);
9966 goto onError;
9967 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009968 if (PyUnicode_READY(item) == -1)
9969 goto onError;
9970 sz += PyUnicode_GET_LENGTH(item);
9971 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -07009972 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009973 if (i != 0)
9974 sz += seplen;
9975 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9976 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009977 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009978 goto onError;
9979 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009980 if (use_memcpy && last_obj != NULL) {
9981 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9982 use_memcpy = 0;
9983 }
9984 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009985 }
Tim Petersced69f82003-09-16 20:30:58 +00009986
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009987 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009988 if (res == NULL)
9989 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009990
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009991 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009992#ifdef Py_DEBUG
9993 use_memcpy = 0;
9994#else
9995 if (use_memcpy) {
9996 res_data = PyUnicode_1BYTE_DATA(res);
9997 kind = PyUnicode_KIND(res);
9998 if (seplen != 0)
9999 sep_data = PyUnicode_1BYTE_DATA(sep);
10000 }
10001#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +020010002 if (use_memcpy) {
10003 for (i = 0; i < seqlen; ++i) {
10004 Py_ssize_t itemlen;
10005 item = items[i];
10006
10007 /* Copy item, and maybe the separator. */
10008 if (i && seplen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010009 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010010 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010011 kind * seplen);
10012 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010013 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010014
10015 itemlen = PyUnicode_GET_LENGTH(item);
10016 if (itemlen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010017 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010018 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010019 kind * itemlen);
10020 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010021 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010022 }
10023 assert(res_data == PyUnicode_1BYTE_DATA(res)
10024 + kind * PyUnicode_GET_LENGTH(res));
10025 }
10026 else {
10027 for (i = 0, res_offset = 0; i < seqlen; ++i) {
10028 Py_ssize_t itemlen;
10029 item = items[i];
10030
10031 /* Copy item, and maybe the separator. */
10032 if (i && seplen != 0) {
10033 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10034 res_offset += seplen;
10035 }
10036
10037 itemlen = PyUnicode_GET_LENGTH(item);
10038 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020010039 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +020010040 res_offset += itemlen;
10041 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010042 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010043 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +020010044 }
Tim Peters8ce9f162004-08-27 01:49:32 +000010045
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010046 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010047 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010048 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010049
Benjamin Peterson29060642009-01-31 22:14:21 +000010050 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010051 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010052 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010053 return NULL;
10054}
10055
/* Store `value` into `length` consecutive characters of the buffer `data`,
   starting at character index `start`.  `kind` selects the character
   width (1, 2 or 4 bytes); PyUnicode_WCHAR_KIND is not accepted.

   Every argument is parenthesized in the expansion so that expression
   arguments keep their meaning.  `length` and `value` may still be
   evaluated more than once (once per stored character in the 2- and
   4-byte cases), so they must be free of side effects. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
10079
Victor Stinnerd3f08822012-05-29 12:57:52 +020010080void
10081_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10082 Py_UCS4 fill_char)
10083{
10084 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
10085 const void *data = PyUnicode_DATA(unicode);
10086 assert(PyUnicode_IS_READY(unicode));
10087 assert(unicode_modifiable(unicode));
10088 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10089 assert(start >= 0);
10090 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
10091 FILL(kind, data, fill_char, start, length);
10092}
10093
Victor Stinner3fe55312012-01-04 00:33:50 +010010094Py_ssize_t
10095PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10096 Py_UCS4 fill_char)
10097{
10098 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +010010099
10100 if (!PyUnicode_Check(unicode)) {
10101 PyErr_BadInternalCall();
10102 return -1;
10103 }
10104 if (PyUnicode_READY(unicode) == -1)
10105 return -1;
10106 if (unicode_check_modifiable(unicode))
10107 return -1;
10108
Victor Stinnerd3f08822012-05-29 12:57:52 +020010109 if (start < 0) {
10110 PyErr_SetString(PyExc_IndexError, "string index out of range");
10111 return -1;
10112 }
Victor Stinner3fe55312012-01-04 00:33:50 +010010113 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10114 PyErr_SetString(PyExc_ValueError,
10115 "fill character is bigger than "
10116 "the string maximum character");
10117 return -1;
10118 }
10119
10120 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10121 length = Py_MIN(maxlen, length);
10122 if (length <= 0)
10123 return 0;
10124
Victor Stinnerd3f08822012-05-29 12:57:52 +020010125 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +010010126 return length;
10127}
10128
Victor Stinner9310abb2011-10-05 00:59:23 +020010129static PyObject *
10130pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010131 Py_ssize_t left,
10132 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010133 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010134{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010135 PyObject *u;
10136 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010137 int kind;
10138 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010139
10140 if (left < 0)
10141 left = 0;
10142 if (right < 0)
10143 right = 0;
10144
Victor Stinnerc4b49542011-12-11 22:44:26 +010010145 if (left == 0 && right == 0)
10146 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010147
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010148 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10149 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010150 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10151 return NULL;
10152 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010153 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010154 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010155 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010156 if (!u)
10157 return NULL;
10158
10159 kind = PyUnicode_KIND(u);
10160 data = PyUnicode_DATA(u);
10161 if (left)
10162 FILL(kind, data, fill, 0, left);
10163 if (right)
10164 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010165 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010166 assert(_PyUnicode_CheckConsistency(u, 1));
10167 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010168}
10169
Alexander Belopolsky40018472011-02-26 01:02:56 +000010170PyObject *
10171PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010172{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010173 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010174
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010175 if (ensure_unicode(string) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010176 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010177
Benjamin Petersonead6b532011-12-20 17:23:42 -060010178 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010179 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010180 if (PyUnicode_IS_ASCII(string))
10181 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010182 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010183 PyUnicode_GET_LENGTH(string), keepends);
10184 else
10185 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010186 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010187 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010188 break;
10189 case PyUnicode_2BYTE_KIND:
10190 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010191 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010192 PyUnicode_GET_LENGTH(string), keepends);
10193 break;
10194 case PyUnicode_4BYTE_KIND:
10195 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010196 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010197 PyUnicode_GET_LENGTH(string), keepends);
10198 break;
10199 default:
10200 assert(0);
10201 list = 0;
10202 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010203 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010204}
10205
Alexander Belopolsky40018472011-02-26 01:02:56 +000010206static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010207split(PyObject *self,
10208 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010209 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010210{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010211 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010212 void *buf1, *buf2;
10213 Py_ssize_t len1, len2;
10214 PyObject* out;
10215
Guido van Rossumd57fd912000-03-10 22:53:23 +000010216 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010217 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010218
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010219 if (PyUnicode_READY(self) == -1)
10220 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010221
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010222 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010223 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010224 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010225 if (PyUnicode_IS_ASCII(self))
10226 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010227 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010228 PyUnicode_GET_LENGTH(self), maxcount
10229 );
10230 else
10231 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010232 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010233 PyUnicode_GET_LENGTH(self), maxcount
10234 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010235 case PyUnicode_2BYTE_KIND:
10236 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010237 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010238 PyUnicode_GET_LENGTH(self), maxcount
10239 );
10240 case PyUnicode_4BYTE_KIND:
10241 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010242 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010243 PyUnicode_GET_LENGTH(self), maxcount
10244 );
10245 default:
10246 assert(0);
10247 return NULL;
10248 }
10249
10250 if (PyUnicode_READY(substring) == -1)
10251 return NULL;
10252
10253 kind1 = PyUnicode_KIND(self);
10254 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010255 len1 = PyUnicode_GET_LENGTH(self);
10256 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010257 if (kind1 < kind2 || len1 < len2) {
10258 out = PyList_New(1);
10259 if (out == NULL)
10260 return NULL;
10261 Py_INCREF(self);
10262 PyList_SET_ITEM(out, 0, self);
10263 return out;
10264 }
10265 buf1 = PyUnicode_DATA(self);
10266 buf2 = PyUnicode_DATA(substring);
10267 if (kind2 != kind1) {
10268 buf2 = _PyUnicode_AsKind(substring, kind1);
10269 if (!buf2)
10270 return NULL;
10271 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010272
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010273 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010274 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010275 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10276 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010277 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010278 else
10279 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010280 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010281 break;
10282 case PyUnicode_2BYTE_KIND:
10283 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010284 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010285 break;
10286 case PyUnicode_4BYTE_KIND:
10287 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010288 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010289 break;
10290 default:
10291 out = NULL;
10292 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010293 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010294 PyMem_Free(buf2);
10295 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010296}
10297
Alexander Belopolsky40018472011-02-26 01:02:56 +000010298static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010299rsplit(PyObject *self,
10300 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010301 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010302{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010303 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010304 void *buf1, *buf2;
10305 Py_ssize_t len1, len2;
10306 PyObject* out;
10307
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010308 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010309 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010310
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010311 if (PyUnicode_READY(self) == -1)
10312 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010313
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010314 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010315 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010316 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010317 if (PyUnicode_IS_ASCII(self))
10318 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010319 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010320 PyUnicode_GET_LENGTH(self), maxcount
10321 );
10322 else
10323 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010324 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010325 PyUnicode_GET_LENGTH(self), maxcount
10326 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010327 case PyUnicode_2BYTE_KIND:
10328 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010329 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010330 PyUnicode_GET_LENGTH(self), maxcount
10331 );
10332 case PyUnicode_4BYTE_KIND:
10333 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010334 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010335 PyUnicode_GET_LENGTH(self), maxcount
10336 );
10337 default:
10338 assert(0);
10339 return NULL;
10340 }
10341
10342 if (PyUnicode_READY(substring) == -1)
10343 return NULL;
10344
10345 kind1 = PyUnicode_KIND(self);
10346 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010347 len1 = PyUnicode_GET_LENGTH(self);
10348 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010349 if (kind1 < kind2 || len1 < len2) {
10350 out = PyList_New(1);
10351 if (out == NULL)
10352 return NULL;
10353 Py_INCREF(self);
10354 PyList_SET_ITEM(out, 0, self);
10355 return out;
10356 }
10357 buf1 = PyUnicode_DATA(self);
10358 buf2 = PyUnicode_DATA(substring);
10359 if (kind2 != kind1) {
10360 buf2 = _PyUnicode_AsKind(substring, kind1);
10361 if (!buf2)
10362 return NULL;
10363 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010364
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010365 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010366 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010367 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10368 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010369 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010370 else
10371 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010372 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010373 break;
10374 case PyUnicode_2BYTE_KIND:
10375 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010376 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010377 break;
10378 case PyUnicode_4BYTE_KIND:
10379 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010380 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010381 break;
10382 default:
10383 out = NULL;
10384 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010385 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010386 PyMem_Free(buf2);
10387 return out;
10388}
10389
10390static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010391anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10392 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010393{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010394 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010395 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010396 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10397 return asciilib_find(buf1, len1, buf2, len2, offset);
10398 else
10399 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010400 case PyUnicode_2BYTE_KIND:
10401 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10402 case PyUnicode_4BYTE_KIND:
10403 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10404 }
10405 assert(0);
10406 return -1;
10407}
10408
10409static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010410anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10411 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010412{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010413 switch (kind) {
10414 case PyUnicode_1BYTE_KIND:
10415 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10416 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10417 else
10418 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10419 case PyUnicode_2BYTE_KIND:
10420 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10421 case PyUnicode_4BYTE_KIND:
10422 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10423 }
10424 assert(0);
10425 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010426}
10427
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010428static void
10429replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10430 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10431{
10432 int kind = PyUnicode_KIND(u);
10433 void *data = PyUnicode_DATA(u);
10434 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10435 if (kind == PyUnicode_1BYTE_KIND) {
10436 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10437 (Py_UCS1 *)data + len,
10438 u1, u2, maxcount);
10439 }
10440 else if (kind == PyUnicode_2BYTE_KIND) {
10441 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10442 (Py_UCS2 *)data + len,
10443 u1, u2, maxcount);
10444 }
10445 else {
10446 assert(kind == PyUnicode_4BYTE_KIND);
10447 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10448 (Py_UCS4 *)data + len,
10449 u1, u2, maxcount);
10450 }
10451}
10452
Alexander Belopolsky40018472011-02-26 01:02:56 +000010453static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010454replace(PyObject *self, PyObject *str1,
10455 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010456{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010457 PyObject *u;
10458 char *sbuf = PyUnicode_DATA(self);
10459 char *buf1 = PyUnicode_DATA(str1);
10460 char *buf2 = PyUnicode_DATA(str2);
10461 int srelease = 0, release1 = 0, release2 = 0;
10462 int skind = PyUnicode_KIND(self);
10463 int kind1 = PyUnicode_KIND(str1);
10464 int kind2 = PyUnicode_KIND(str2);
10465 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10466 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10467 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010468 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010469 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010470
10471 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010472 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010473 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010474 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010475
Victor Stinner59de0ee2011-10-07 10:01:28 +020010476 if (str1 == str2)
10477 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010478
Victor Stinner49a0a212011-10-12 23:46:10 +020010479 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010480 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10481 if (maxchar < maxchar_str1)
10482 /* substring too wide to be present */
10483 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010484 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10485 /* Replacing str1 with str2 may cause a maxchar reduction in the
10486 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010487 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010488 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010489
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010490 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010491 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010492 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010493 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010494 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010495 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010496 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010497 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010498
Victor Stinner69ed0f42013-04-09 21:48:24 +020010499 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010500 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010501 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010502 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010503 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010504 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010505 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010506 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010507
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010508 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10509 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010510 }
10511 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010512 int rkind = skind;
10513 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010514 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010515
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010516 if (kind1 < rkind) {
10517 /* widen substring */
10518 buf1 = _PyUnicode_AsKind(str1, rkind);
10519 if (!buf1) goto error;
10520 release1 = 1;
10521 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010522 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010523 if (i < 0)
10524 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010525 if (rkind > kind2) {
10526 /* widen replacement */
10527 buf2 = _PyUnicode_AsKind(str2, rkind);
10528 if (!buf2) goto error;
10529 release2 = 1;
10530 }
10531 else if (rkind < kind2) {
10532 /* widen self and buf1 */
10533 rkind = kind2;
10534 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010535 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010536 sbuf = _PyUnicode_AsKind(self, rkind);
10537 if (!sbuf) goto error;
10538 srelease = 1;
10539 buf1 = _PyUnicode_AsKind(str1, rkind);
10540 if (!buf1) goto error;
10541 release1 = 1;
10542 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010543 u = PyUnicode_New(slen, maxchar);
10544 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010545 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010546 assert(PyUnicode_KIND(u) == rkind);
10547 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010548
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010549 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010550 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010551 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010552 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010553 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010554 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010555
10556 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010557 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010558 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010559 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010560 if (i == -1)
10561 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010562 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010563 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010564 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010565 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010566 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010567 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010568 }
10569 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010570 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010571 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010572 int rkind = skind;
10573 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010574
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010575 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010576 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010577 buf1 = _PyUnicode_AsKind(str1, rkind);
10578 if (!buf1) goto error;
10579 release1 = 1;
10580 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010581 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010582 if (n == 0)
10583 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010584 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010585 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010586 buf2 = _PyUnicode_AsKind(str2, rkind);
10587 if (!buf2) goto error;
10588 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010589 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010590 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010591 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010592 rkind = kind2;
10593 sbuf = _PyUnicode_AsKind(self, rkind);
10594 if (!sbuf) goto error;
10595 srelease = 1;
10596 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010597 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010598 buf1 = _PyUnicode_AsKind(str1, rkind);
10599 if (!buf1) goto error;
10600 release1 = 1;
10601 }
10602 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10603 PyUnicode_GET_LENGTH(str1))); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010604 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010605 PyErr_SetString(PyExc_OverflowError,
10606 "replace string is too long");
10607 goto error;
10608 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010609 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010610 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010611 _Py_INCREF_UNICODE_EMPTY();
10612 if (!unicode_empty)
10613 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010614 u = unicode_empty;
10615 goto done;
10616 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010617 if (new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010618 PyErr_SetString(PyExc_OverflowError,
10619 "replace string is too long");
10620 goto error;
10621 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010622 u = PyUnicode_New(new_size, maxchar);
10623 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010624 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010625 assert(PyUnicode_KIND(u) == rkind);
10626 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010627 ires = i = 0;
10628 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010629 while (n-- > 0) {
10630 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010631 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010632 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010633 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010634 if (j == -1)
10635 break;
10636 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010637 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010638 memcpy(res + rkind * ires,
10639 sbuf + rkind * i,
10640 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010641 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010642 }
10643 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010644 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010645 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010646 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010647 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010648 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010649 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010650 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010651 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010652 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010653 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010654 memcpy(res + rkind * ires,
10655 sbuf + rkind * i,
10656 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010657 }
10658 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010659 /* interleave */
10660 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010661 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010662 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010663 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010664 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010665 if (--n <= 0)
10666 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010667 memcpy(res + rkind * ires,
10668 sbuf + rkind * i,
10669 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010670 ires++;
10671 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010672 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010673 memcpy(res + rkind * ires,
10674 sbuf + rkind * i,
10675 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010676 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010677 }
10678
10679 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010680 unicode_adjust_maxchar(&u);
10681 if (u == NULL)
10682 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010683 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010684
10685 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010686 if (srelease)
10687 PyMem_FREE(sbuf);
10688 if (release1)
10689 PyMem_FREE(buf1);
10690 if (release2)
10691 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010692 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010693 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010694
Benjamin Peterson29060642009-01-31 22:14:21 +000010695 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010696 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010697 if (srelease)
10698 PyMem_FREE(sbuf);
10699 if (release1)
10700 PyMem_FREE(buf1);
10701 if (release2)
10702 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010703 return unicode_result_unchanged(self);
10704
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010705 error:
10706 if (srelease && sbuf)
10707 PyMem_FREE(sbuf);
10708 if (release1 && buf1)
10709 PyMem_FREE(buf1);
10710 if (release2 && buf2)
10711 PyMem_FREE(buf2);
10712 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010713}
10714
10715/* --- Unicode Object Methods --------------------------------------------- */
10716
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010717PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010718 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010719\n\
10720Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010721characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010722
10723static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010724unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010725{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010726 if (PyUnicode_READY(self) == -1)
10727 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010728 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010729}
10730
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010731PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010732 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010733\n\
10734Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010735have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010736
10737static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010738unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010739{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010740 if (PyUnicode_READY(self) == -1)
10741 return NULL;
10742 if (PyUnicode_GET_LENGTH(self) == 0)
10743 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010744 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010745}
10746
Benjamin Petersond5890c82012-01-14 13:23:30 -050010747PyDoc_STRVAR(casefold__doc__,
10748 "S.casefold() -> str\n\
10749\n\
10750Return a version of S suitable for caseless comparisons.");
10751
10752static PyObject *
10753unicode_casefold(PyObject *self)
10754{
10755 if (PyUnicode_READY(self) == -1)
10756 return NULL;
10757 if (PyUnicode_IS_ASCII(self))
10758 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010759 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010760}
10761
10762
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010763/* Argument converter. Accepts a single Unicode character. */
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010764
10765static int
10766convert_uc(PyObject *obj, void *addr)
10767{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010768 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010769
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010770 if (!PyUnicode_Check(obj)) {
10771 PyErr_Format(PyExc_TypeError,
10772 "The fill character must be a unicode character, "
10773 "not %.100s", Py_TYPE(obj)->tp_name);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010774 return 0;
10775 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010776 if (PyUnicode_READY(obj) < 0)
10777 return 0;
10778 if (PyUnicode_GET_LENGTH(obj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010779 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010780 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010781 return 0;
10782 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010783 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010784 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010785}
10786
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010787PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010788 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010789\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010790Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010791done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010792
10793static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010794unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010795{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010796 Py_ssize_t marg, left;
10797 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010798 Py_UCS4 fillchar = ' ';
10799
Victor Stinnere9a29352011-10-01 02:14:59 +020010800 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010801 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010802
Benjamin Petersonbac79492012-01-14 13:34:47 -050010803 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010804 return NULL;
10805
Victor Stinnerc4b49542011-12-11 22:44:26 +010010806 if (PyUnicode_GET_LENGTH(self) >= width)
10807 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010808
Victor Stinnerc4b49542011-12-11 22:44:26 +010010809 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010810 left = marg / 2 + (marg & width & 1);
10811
Victor Stinner9310abb2011-10-05 00:59:23 +020010812 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010813}
10814
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010815/* This function assumes that str1 and str2 are readied by the caller. */
10816
Marc-André Lemburge5034372000-08-08 08:04:29 +000010817static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010818unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010819{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010820#define COMPARE(TYPE1, TYPE2) \
10821 do { \
10822 TYPE1* p1 = (TYPE1 *)data1; \
10823 TYPE2* p2 = (TYPE2 *)data2; \
10824 TYPE1* end = p1 + len; \
10825 Py_UCS4 c1, c2; \
10826 for (; p1 != end; p1++, p2++) { \
10827 c1 = *p1; \
10828 c2 = *p2; \
10829 if (c1 != c2) \
10830 return (c1 < c2) ? -1 : 1; \
10831 } \
10832 } \
10833 while (0)
10834
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010835 int kind1, kind2;
10836 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010837 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010838
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010839 kind1 = PyUnicode_KIND(str1);
10840 kind2 = PyUnicode_KIND(str2);
10841 data1 = PyUnicode_DATA(str1);
10842 data2 = PyUnicode_DATA(str2);
10843 len1 = PyUnicode_GET_LENGTH(str1);
10844 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010845 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010846
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010847 switch(kind1) {
10848 case PyUnicode_1BYTE_KIND:
10849 {
10850 switch(kind2) {
10851 case PyUnicode_1BYTE_KIND:
10852 {
10853 int cmp = memcmp(data1, data2, len);
10854 /* normalize result of memcmp() into the range [-1; 1] */
10855 if (cmp < 0)
10856 return -1;
10857 if (cmp > 0)
10858 return 1;
10859 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020010860 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010861 case PyUnicode_2BYTE_KIND:
10862 COMPARE(Py_UCS1, Py_UCS2);
10863 break;
10864 case PyUnicode_4BYTE_KIND:
10865 COMPARE(Py_UCS1, Py_UCS4);
10866 break;
10867 default:
10868 assert(0);
10869 }
10870 break;
10871 }
10872 case PyUnicode_2BYTE_KIND:
10873 {
10874 switch(kind2) {
10875 case PyUnicode_1BYTE_KIND:
10876 COMPARE(Py_UCS2, Py_UCS1);
10877 break;
10878 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010879 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010880 COMPARE(Py_UCS2, Py_UCS2);
10881 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010882 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010883 case PyUnicode_4BYTE_KIND:
10884 COMPARE(Py_UCS2, Py_UCS4);
10885 break;
10886 default:
10887 assert(0);
10888 }
10889 break;
10890 }
10891 case PyUnicode_4BYTE_KIND:
10892 {
10893 switch(kind2) {
10894 case PyUnicode_1BYTE_KIND:
10895 COMPARE(Py_UCS4, Py_UCS1);
10896 break;
10897 case PyUnicode_2BYTE_KIND:
10898 COMPARE(Py_UCS4, Py_UCS2);
10899 break;
10900 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010901 {
10902#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10903 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10904 /* normalize result of wmemcmp() into the range [-1; 1] */
10905 if (cmp < 0)
10906 return -1;
10907 if (cmp > 0)
10908 return 1;
10909#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010910 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020010911#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010912 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010913 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010914 default:
10915 assert(0);
10916 }
10917 break;
10918 }
10919 default:
10920 assert(0);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010921 }
10922
Victor Stinner770e19e2012-10-04 22:59:45 +020010923 if (len1 == len2)
10924 return 0;
10925 if (len1 < len2)
10926 return -1;
10927 else
10928 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010929
10930#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000010931}
10932
Benjamin Peterson621b4302016-09-09 13:54:34 -070010933static int
Victor Stinnere5567ad2012-10-23 02:48:49 +020010934unicode_compare_eq(PyObject *str1, PyObject *str2)
10935{
10936 int kind;
10937 void *data1, *data2;
10938 Py_ssize_t len;
10939 int cmp;
10940
Victor Stinnere5567ad2012-10-23 02:48:49 +020010941 len = PyUnicode_GET_LENGTH(str1);
10942 if (PyUnicode_GET_LENGTH(str2) != len)
10943 return 0;
10944 kind = PyUnicode_KIND(str1);
10945 if (PyUnicode_KIND(str2) != kind)
10946 return 0;
10947 data1 = PyUnicode_DATA(str1);
10948 data2 = PyUnicode_DATA(str2);
10949
10950 cmp = memcmp(data1, data2, len * kind);
10951 return (cmp == 0);
10952}
10953
10954
Alexander Belopolsky40018472011-02-26 01:02:56 +000010955int
10956PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010957{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010958 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10959 if (PyUnicode_READY(left) == -1 ||
10960 PyUnicode_READY(right) == -1)
10961 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010010962
10963 /* a string is equal to itself */
10964 if (left == right)
10965 return 0;
10966
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010967 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010968 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010969 PyErr_Format(PyExc_TypeError,
10970 "Can't compare %.100s and %.100s",
10971 left->ob_type->tp_name,
10972 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010973 return -1;
10974}
10975
Martin v. Löwis5b222132007-06-10 09:51:05 +000010976int
Victor Stinnerad14ccd2013-11-07 00:46:04 +010010977_PyUnicode_CompareWithId(PyObject *left, _Py_Identifier *right)
10978{
10979 PyObject *right_str = _PyUnicode_FromId(right); /* borrowed */
10980 if (right_str == NULL)
10981 return -1;
10982 return PyUnicode_Compare(left, right_str);
10983}
10984
10985int
Martin v. Löwis5b222132007-06-10 09:51:05 +000010986PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10987{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010988 Py_ssize_t i;
10989 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010990 Py_UCS4 chr;
10991
Victor Stinner910337b2011-10-03 03:20:16 +020010992 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010993 if (PyUnicode_READY(uni) == -1)
10994 return -1;
10995 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010010996 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010010997 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010010998 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010010999 size_t len, len2 = strlen(str);
11000 int cmp;
11001
11002 len = Py_MIN(len1, len2);
11003 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010011004 if (cmp != 0) {
11005 if (cmp < 0)
11006 return -1;
11007 else
11008 return 1;
11009 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010011010 if (len1 > len2)
11011 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011012 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010011013 return -1; /* str is longer */
11014 return 0;
11015 }
11016 else {
11017 void *data = PyUnicode_DATA(uni);
11018 /* Compare Unicode string and source character set string */
11019 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020011020 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010011021 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11022 /* This check keeps Python strings that end in '\0' from comparing equal
11023 to C strings identical up to that point. */
11024 if (PyUnicode_GET_LENGTH(uni) != i || chr)
11025 return 1; /* uni is longer */
11026 if (str[i])
11027 return -1; /* str is longer */
11028 return 0;
11029 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000011030}
11031
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011032
Benjamin Peterson29060642009-01-31 22:14:21 +000011033#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000011034 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011035
Alexander Belopolsky40018472011-02-26 01:02:56 +000011036PyObject *
11037PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011038{
11039 int result;
Victor Stinnere5567ad2012-10-23 02:48:49 +020011040 PyObject *v;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011041
Victor Stinnere5567ad2012-10-23 02:48:49 +020011042 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11043 Py_RETURN_NOTIMPLEMENTED;
11044
11045 if (PyUnicode_READY(left) == -1 ||
11046 PyUnicode_READY(right) == -1)
11047 return NULL;
11048
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011049 if (left == right) {
11050 switch (op) {
11051 case Py_EQ:
11052 case Py_LE:
11053 case Py_GE:
11054 /* a string is equal to itself */
11055 v = Py_True;
11056 break;
11057 case Py_NE:
11058 case Py_LT:
11059 case Py_GT:
11060 v = Py_False;
11061 break;
11062 default:
11063 PyErr_BadArgument();
11064 return NULL;
11065 }
11066 }
11067 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020011068 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010011069 result ^= (op == Py_NE);
11070 v = TEST_COND(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020011071 }
11072 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020011073 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011074
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011075 /* Convert the return value to a Boolean */
11076 switch (op) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011077 case Py_LE:
11078 v = TEST_COND(result <= 0);
11079 break;
11080 case Py_GE:
11081 v = TEST_COND(result >= 0);
11082 break;
11083 case Py_LT:
11084 v = TEST_COND(result == -1);
11085 break;
11086 case Py_GT:
11087 v = TEST_COND(result == 1);
11088 break;
11089 default:
11090 PyErr_BadArgument();
11091 return NULL;
11092 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011093 }
Victor Stinnere5567ad2012-10-23 02:48:49 +020011094 Py_INCREF(v);
11095 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011096}
11097
Alexander Belopolsky40018472011-02-26 01:02:56 +000011098int
Raymond Hettingerac2ef652015-07-04 16:04:44 -070011099_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11100{
11101 return unicode_eq(aa, bb);
11102}
11103
11104int
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011105PyUnicode_Contains(PyObject *str, PyObject *substr)
Guido van Rossum403d68b2000-03-13 15:55:09 +000011106{
Victor Stinner77282cb2013-04-14 19:22:47 +020011107 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011108 void *buf1, *buf2;
11109 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011110 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011111
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011112 if (!PyUnicode_Check(substr)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011113 PyErr_Format(PyExc_TypeError,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011114 "'in <string>' requires string as left operand, not %.100s",
11115 Py_TYPE(substr)->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011116 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011117 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011118 if (PyUnicode_READY(substr) == -1)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011119 return -1;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011120 if (ensure_unicode(str) < 0)
11121 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011122
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011123 kind1 = PyUnicode_KIND(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011124 kind2 = PyUnicode_KIND(substr);
11125 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011126 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011127 len1 = PyUnicode_GET_LENGTH(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011128 len2 = PyUnicode_GET_LENGTH(substr);
11129 if (len1 < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011130 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011131 buf1 = PyUnicode_DATA(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011132 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011133 if (len2 == 1) {
11134 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11135 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011136 return result;
11137 }
11138 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011139 buf2 = _PyUnicode_AsKind(substr, kind1);
11140 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011141 return -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011142 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011143
Victor Stinner77282cb2013-04-14 19:22:47 +020011144 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011145 case PyUnicode_1BYTE_KIND:
11146 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11147 break;
11148 case PyUnicode_2BYTE_KIND:
11149 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11150 break;
11151 case PyUnicode_4BYTE_KIND:
11152 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11153 break;
11154 default:
11155 result = -1;
11156 assert(0);
11157 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011158
Victor Stinner77282cb2013-04-14 19:22:47 +020011159 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011160 PyMem_Free(buf2);
11161
Guido van Rossum403d68b2000-03-13 15:55:09 +000011162 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011163}
11164
Guido van Rossumd57fd912000-03-10 22:53:23 +000011165/* Concat to string or Unicode object giving a new Unicode object. */
11166
Alexander Belopolsky40018472011-02-26 01:02:56 +000011167PyObject *
11168PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011169{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011170 PyObject *result;
Victor Stinner127226b2011-10-13 01:12:34 +020011171 Py_UCS4 maxchar, maxchar2;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011172 Py_ssize_t left_len, right_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011173
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011174 if (ensure_unicode(left) < 0 || ensure_unicode(right) < 0)
11175 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011176
11177 /* Shortcuts */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011178 if (left == unicode_empty)
11179 return PyUnicode_FromObject(right);
11180 if (right == unicode_empty)
11181 return PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011182
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011183 left_len = PyUnicode_GET_LENGTH(left);
11184 right_len = PyUnicode_GET_LENGTH(right);
11185 if (left_len > PY_SSIZE_T_MAX - right_len) {
Victor Stinner488fa492011-12-12 00:01:39 +010011186 PyErr_SetString(PyExc_OverflowError,
11187 "strings are too large to concat");
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011188 return NULL;
Victor Stinner488fa492011-12-12 00:01:39 +010011189 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011190 new_len = left_len + right_len;
Victor Stinner488fa492011-12-12 00:01:39 +010011191
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011192 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11193 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011194 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011195
Guido van Rossumd57fd912000-03-10 22:53:23 +000011196 /* Concat the two Unicode strings */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011197 result = PyUnicode_New(new_len, maxchar);
11198 if (result == NULL)
11199 return NULL;
11200 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11201 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11202 assert(_PyUnicode_CheckConsistency(result, 1));
11203 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011204}
11205
Walter Dörwald1ab83302007-05-18 17:15:44 +000011206void
Victor Stinner23e56682011-10-03 03:54:37 +020011207PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011208{
Victor Stinner23e56682011-10-03 03:54:37 +020011209 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011210 Py_UCS4 maxchar, maxchar2;
11211 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011212
11213 if (p_left == NULL) {
11214 if (!PyErr_Occurred())
11215 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011216 return;
11217 }
Victor Stinner23e56682011-10-03 03:54:37 +020011218 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011219 if (right == NULL || left == NULL
11220 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011221 if (!PyErr_Occurred())
11222 PyErr_BadInternalCall();
11223 goto error;
11224 }
11225
Benjamin Petersonbac79492012-01-14 13:34:47 -050011226 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011227 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011228 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011229 goto error;
11230
Victor Stinner488fa492011-12-12 00:01:39 +010011231 /* Shortcuts */
11232 if (left == unicode_empty) {
11233 Py_DECREF(left);
11234 Py_INCREF(right);
11235 *p_left = right;
11236 return;
11237 }
11238 if (right == unicode_empty)
11239 return;
11240
11241 left_len = PyUnicode_GET_LENGTH(left);
11242 right_len = PyUnicode_GET_LENGTH(right);
11243 if (left_len > PY_SSIZE_T_MAX - right_len) {
11244 PyErr_SetString(PyExc_OverflowError,
11245 "strings are too large to concat");
11246 goto error;
11247 }
11248 new_len = left_len + right_len;
11249
11250 if (unicode_modifiable(left)
11251 && PyUnicode_CheckExact(right)
11252 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011253 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11254 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011255 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011256 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011257 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11258 {
11259 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011260 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011261 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011262
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011263 /* copy 'right' into the newly allocated area of 'left' */
11264 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011265 }
Victor Stinner488fa492011-12-12 00:01:39 +010011266 else {
11267 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11268 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011269 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011270
Victor Stinner488fa492011-12-12 00:01:39 +010011271 /* Concat the two Unicode strings */
11272 res = PyUnicode_New(new_len, maxchar);
11273 if (res == NULL)
11274 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011275 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11276 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011277 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011278 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011279 }
11280 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011281 return;
11282
11283error:
Victor Stinner488fa492011-12-12 00:01:39 +010011284 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011285}
11286
11287void
11288PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11289{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011290 PyUnicode_Append(pleft, right);
11291 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011292}
11293
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011294/*
11295Wraps stringlib_parse_args_finds() and additionally ensures that the
11296first argument is a unicode object.
11297*/
11298
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070011299static inline int
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011300parse_args_finds_unicode(const char * function_name, PyObject *args,
11301 PyObject **substring,
11302 Py_ssize_t *start, Py_ssize_t *end)
11303{
11304 if(stringlib_parse_args_finds(function_name, args, substring,
11305 start, end)) {
11306 if (ensure_unicode(*substring) < 0)
11307 return 0;
11308 return 1;
11309 }
11310 return 0;
11311}
11312
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011313PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011314 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011315\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011316Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011317string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011318interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011319
11320static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011321unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011322{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011323 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011324 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011325 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011326 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011327 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011328 void *buf1, *buf2;
11329 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011330
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011331 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011332 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011333
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011334 kind1 = PyUnicode_KIND(self);
11335 kind2 = PyUnicode_KIND(substring);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011336 if (kind1 < kind2)
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011337 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011338
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011339 len1 = PyUnicode_GET_LENGTH(self);
11340 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011341 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011342 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011343 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011344
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011345 buf1 = PyUnicode_DATA(self);
11346 buf2 = PyUnicode_DATA(substring);
11347 if (kind2 != kind1) {
11348 buf2 = _PyUnicode_AsKind(substring, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011349 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011350 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011351 }
11352 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011353 case PyUnicode_1BYTE_KIND:
11354 iresult = ucs1lib_count(
11355 ((Py_UCS1*)buf1) + start, end - start,
11356 buf2, len2, PY_SSIZE_T_MAX
11357 );
11358 break;
11359 case PyUnicode_2BYTE_KIND:
11360 iresult = ucs2lib_count(
11361 ((Py_UCS2*)buf1) + start, end - start,
11362 buf2, len2, PY_SSIZE_T_MAX
11363 );
11364 break;
11365 case PyUnicode_4BYTE_KIND:
11366 iresult = ucs4lib_count(
11367 ((Py_UCS4*)buf1) + start, end - start,
11368 buf2, len2, PY_SSIZE_T_MAX
11369 );
11370 break;
11371 default:
11372 assert(0); iresult = 0;
11373 }
11374
11375 result = PyLong_FromSsize_t(iresult);
11376
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011377 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011378 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011379
Guido van Rossumd57fd912000-03-10 22:53:23 +000011380 return result;
11381}
11382
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011383PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000011384 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011385\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000011386Encode S using the codec registered for encoding. Default encoding\n\
11387is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000011388handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000011389a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
11390'xmlcharrefreplace' as well as any other name registered with\n\
11391codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011392
11393static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011394unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011395{
Benjamin Peterson308d6372009-09-18 21:42:35 +000011396 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011397 char *encoding = NULL;
11398 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000011399
Benjamin Peterson308d6372009-09-18 21:42:35 +000011400 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
11401 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011402 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011403 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011404}
11405
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011406PyDoc_STRVAR(expandtabs__doc__,
Ezio Melotti745d54d2013-11-16 19:10:57 +020011407 "S.expandtabs(tabsize=8) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011408\n\
11409Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011410If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011411
11412static PyObject*
Ezio Melotti745d54d2013-11-16 19:10:57 +020011413unicode_expandtabs(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011414{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011415 Py_ssize_t i, j, line_pos, src_len, incr;
11416 Py_UCS4 ch;
11417 PyObject *u;
11418 void *src_data, *dest_data;
Ezio Melotti745d54d2013-11-16 19:10:57 +020011419 static char *kwlist[] = {"tabsize", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011420 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011421 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011422 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011423
Ezio Melotti745d54d2013-11-16 19:10:57 +020011424 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:expandtabs",
11425 kwlist, &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000011426 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011427
Antoine Pitrou22425222011-10-04 19:10:51 +020011428 if (PyUnicode_READY(self) == -1)
11429 return NULL;
11430
Thomas Wouters7e474022000-07-16 12:04:32 +000011431 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011432 src_len = PyUnicode_GET_LENGTH(self);
11433 i = j = line_pos = 0;
11434 kind = PyUnicode_KIND(self);
11435 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011436 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011437 for (; i < src_len; i++) {
11438 ch = PyUnicode_READ(kind, src_data, i);
11439 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011440 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011441 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011442 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011443 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011444 goto overflow;
11445 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011446 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011447 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011448 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011449 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011450 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011451 goto overflow;
11452 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011453 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011454 if (ch == '\n' || ch == '\r')
11455 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011456 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011457 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011458 if (!found)
11459 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011460
Guido van Rossumd57fd912000-03-10 22:53:23 +000011461 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011462 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011463 if (!u)
11464 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011465 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011466
Antoine Pitroue71d5742011-10-04 15:55:09 +020011467 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011468
Antoine Pitroue71d5742011-10-04 15:55:09 +020011469 for (; i < src_len; i++) {
11470 ch = PyUnicode_READ(kind, src_data, i);
11471 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011472 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011473 incr = tabsize - (line_pos % tabsize);
11474 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010011475 FILL(kind, dest_data, ' ', j, incr);
11476 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011477 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011478 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011479 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011480 line_pos++;
11481 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011482 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011483 if (ch == '\n' || ch == '\r')
11484 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011485 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011486 }
11487 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011488 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011489
Antoine Pitroue71d5742011-10-04 15:55:09 +020011490 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011491 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11492 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011493}
11494
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011495PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011496 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011497\n\
11498Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011499such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011500arguments start and end are interpreted as in slice notation.\n\
11501\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011502Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011503
11504static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011505unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011506{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011507 /* initialize variables to prevent gcc warning */
11508 PyObject *substring = NULL;
11509 Py_ssize_t start = 0;
11510 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011511 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011512
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011513 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011514 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011515
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011516 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011517 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011518
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011519 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011520
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011521 if (result == -2)
11522 return NULL;
11523
Christian Heimes217cfd12007-12-02 14:31:20 +000011524 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011525}
11526
11527static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011528unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011529{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011530 void *data;
11531 enum PyUnicode_Kind kind;
11532 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011533
11534 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
11535 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011536 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011537 }
11538 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11539 PyErr_SetString(PyExc_IndexError, "string index out of range");
11540 return NULL;
11541 }
11542 kind = PyUnicode_KIND(self);
11543 data = PyUnicode_DATA(self);
11544 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011545 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011546}
11547
Guido van Rossumc2504932007-09-18 19:42:40 +000011548/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011549 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011550static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011551unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011552{
Guido van Rossumc2504932007-09-18 19:42:40 +000011553 Py_ssize_t len;
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011554 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011555
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011556#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011557 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011558#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011559 if (_PyUnicode_HASH(self) != -1)
11560 return _PyUnicode_HASH(self);
11561 if (PyUnicode_READY(self) == -1)
11562 return -1;
11563 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011564 /*
11565 We make the hash of the empty string be 0, rather than using
11566 (prefix ^ suffix), since this slightly obfuscates the hash secret
11567 */
11568 if (len == 0) {
11569 _PyUnicode_HASH(self) = 0;
11570 return 0;
11571 }
Christian Heimes985ecdc2013-11-20 11:46:18 +010011572 x = _Py_HashBytes(PyUnicode_DATA(self),
11573 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011574 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011575 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011576}
11577
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011578PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011579 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011580\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011581Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011582
11583static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011584unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011585{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011586 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011587 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011588 PyObject *substring = NULL;
11589 Py_ssize_t start = 0;
11590 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011591
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011592 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011593 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011594
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011595 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011596 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011597
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011598 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011599
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011600 if (result == -2)
11601 return NULL;
11602
Guido van Rossumd57fd912000-03-10 22:53:23 +000011603 if (result < 0) {
11604 PyErr_SetString(PyExc_ValueError, "substring not found");
11605 return NULL;
11606 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011607
Christian Heimes217cfd12007-12-02 14:31:20 +000011608 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011609}
11610
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011611PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011612 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011613\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011614Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011615at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011616
11617static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011618unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011619{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011620 Py_ssize_t i, length;
11621 int kind;
11622 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011623 int cased;
11624
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011625 if (PyUnicode_READY(self) == -1)
11626 return NULL;
11627 length = PyUnicode_GET_LENGTH(self);
11628 kind = PyUnicode_KIND(self);
11629 data = PyUnicode_DATA(self);
11630
Guido van Rossumd57fd912000-03-10 22:53:23 +000011631 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011632 if (length == 1)
11633 return PyBool_FromLong(
11634 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011635
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011636 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011637 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011638 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011639
Guido van Rossumd57fd912000-03-10 22:53:23 +000011640 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011641 for (i = 0; i < length; i++) {
11642 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011643
Benjamin Peterson29060642009-01-31 22:14:21 +000011644 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11645 return PyBool_FromLong(0);
11646 else if (!cased && Py_UNICODE_ISLOWER(ch))
11647 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011648 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011649 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011650}
11651
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011652PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011653 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011654\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011655Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011656at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011657
11658static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011659unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011660{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011661 Py_ssize_t i, length;
11662 int kind;
11663 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011664 int cased;
11665
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011666 if (PyUnicode_READY(self) == -1)
11667 return NULL;
11668 length = PyUnicode_GET_LENGTH(self);
11669 kind = PyUnicode_KIND(self);
11670 data = PyUnicode_DATA(self);
11671
Guido van Rossumd57fd912000-03-10 22:53:23 +000011672 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011673 if (length == 1)
11674 return PyBool_FromLong(
11675 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011676
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011677 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011678 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011679 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011680
Guido van Rossumd57fd912000-03-10 22:53:23 +000011681 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011682 for (i = 0; i < length; i++) {
11683 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011684
Benjamin Peterson29060642009-01-31 22:14:21 +000011685 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11686 return PyBool_FromLong(0);
11687 else if (!cased && Py_UNICODE_ISUPPER(ch))
11688 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011689 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011690 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011691}
11692
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011693PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011694 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011695\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011696Return True if S is a titlecased string and there is at least one\n\
11697character in S, i.e. upper- and titlecase characters may only\n\
11698follow uncased characters and lowercase characters only cased ones.\n\
11699Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011700
11701static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011702unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011703{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011704 Py_ssize_t i, length;
11705 int kind;
11706 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011707 int cased, previous_is_cased;
11708
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011709 if (PyUnicode_READY(self) == -1)
11710 return NULL;
11711 length = PyUnicode_GET_LENGTH(self);
11712 kind = PyUnicode_KIND(self);
11713 data = PyUnicode_DATA(self);
11714
Guido van Rossumd57fd912000-03-10 22:53:23 +000011715 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011716 if (length == 1) {
11717 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11718 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11719 (Py_UNICODE_ISUPPER(ch) != 0));
11720 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011721
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011722 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011723 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011724 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011725
Guido van Rossumd57fd912000-03-10 22:53:23 +000011726 cased = 0;
11727 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011728 for (i = 0; i < length; i++) {
11729 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011730
Benjamin Peterson29060642009-01-31 22:14:21 +000011731 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11732 if (previous_is_cased)
11733 return PyBool_FromLong(0);
11734 previous_is_cased = 1;
11735 cased = 1;
11736 }
11737 else if (Py_UNICODE_ISLOWER(ch)) {
11738 if (!previous_is_cased)
11739 return PyBool_FromLong(0);
11740 previous_is_cased = 1;
11741 cased = 1;
11742 }
11743 else
11744 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011745 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011746 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011747}
11748
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011749PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011750 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011751\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011752Return True if all characters in S are whitespace\n\
11753and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011754
11755static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011756unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011757{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011758 Py_ssize_t i, length;
11759 int kind;
11760 void *data;
11761
11762 if (PyUnicode_READY(self) == -1)
11763 return NULL;
11764 length = PyUnicode_GET_LENGTH(self);
11765 kind = PyUnicode_KIND(self);
11766 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011767
Guido van Rossumd57fd912000-03-10 22:53:23 +000011768 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011769 if (length == 1)
11770 return PyBool_FromLong(
11771 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011772
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011773 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011774 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011775 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011776
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011777 for (i = 0; i < length; i++) {
11778 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011779 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011780 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011781 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011782 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011783}
11784
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011785PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011786 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011787\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011788Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011789and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011790
11791static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011792unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011793{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011794 Py_ssize_t i, length;
11795 int kind;
11796 void *data;
11797
11798 if (PyUnicode_READY(self) == -1)
11799 return NULL;
11800 length = PyUnicode_GET_LENGTH(self);
11801 kind = PyUnicode_KIND(self);
11802 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011803
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011804 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011805 if (length == 1)
11806 return PyBool_FromLong(
11807 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011808
11809 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011810 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011811 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011812
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011813 for (i = 0; i < length; i++) {
11814 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011815 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011816 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011817 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011818}
11819
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011820PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011821 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011822\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011823Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011824and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011825
11826static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011827unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011828{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011829 int kind;
11830 void *data;
11831 Py_ssize_t len, i;
11832
11833 if (PyUnicode_READY(self) == -1)
11834 return NULL;
11835
11836 kind = PyUnicode_KIND(self);
11837 data = PyUnicode_DATA(self);
11838 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011839
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011840 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011841 if (len == 1) {
11842 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11843 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11844 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011845
11846 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011847 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011848 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011849
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011850 for (i = 0; i < len; i++) {
11851 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011852 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011853 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011854 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011855 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011856}
11857
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011858PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011859 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011860\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011861Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011862False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011863
11864static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011865unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011866{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011867 Py_ssize_t i, length;
11868 int kind;
11869 void *data;
11870
11871 if (PyUnicode_READY(self) == -1)
11872 return NULL;
11873 length = PyUnicode_GET_LENGTH(self);
11874 kind = PyUnicode_KIND(self);
11875 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011876
Guido van Rossumd57fd912000-03-10 22:53:23 +000011877 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011878 if (length == 1)
11879 return PyBool_FromLong(
11880 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011881
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011882 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011883 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011884 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011885
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011886 for (i = 0; i < length; i++) {
11887 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011888 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011889 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011890 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011891}
11892
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011893PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011894 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011895\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011896Return True if all characters in S are digits\n\
11897and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011898
11899static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011900unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011901{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011902 Py_ssize_t i, length;
11903 int kind;
11904 void *data;
11905
11906 if (PyUnicode_READY(self) == -1)
11907 return NULL;
11908 length = PyUnicode_GET_LENGTH(self);
11909 kind = PyUnicode_KIND(self);
11910 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011911
Guido van Rossumd57fd912000-03-10 22:53:23 +000011912 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011913 if (length == 1) {
11914 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11915 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11916 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011917
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011918 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011919 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011920 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011921
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011922 for (i = 0; i < length; i++) {
11923 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011924 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011925 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011926 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011927}
11928
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011929PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011930 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011931\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011932Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011933False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011934
11935static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011936unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011937{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011938 Py_ssize_t i, length;
11939 int kind;
11940 void *data;
11941
11942 if (PyUnicode_READY(self) == -1)
11943 return NULL;
11944 length = PyUnicode_GET_LENGTH(self);
11945 kind = PyUnicode_KIND(self);
11946 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011947
Guido van Rossumd57fd912000-03-10 22:53:23 +000011948 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011949 if (length == 1)
11950 return PyBool_FromLong(
11951 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011952
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011953 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011954 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011955 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011956
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011957 for (i = 0; i < length; i++) {
11958 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011959 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011960 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011961 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011962}
11963
Martin v. Löwis47383402007-08-15 07:32:56 +000011964int
11965PyUnicode_IsIdentifier(PyObject *self)
11966{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011967 int kind;
11968 void *data;
11969 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011970 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011971
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011972 if (PyUnicode_READY(self) == -1) {
11973 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011974 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011975 }
11976
11977 /* Special case for empty strings */
11978 if (PyUnicode_GET_LENGTH(self) == 0)
11979 return 0;
11980 kind = PyUnicode_KIND(self);
11981 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011982
11983 /* PEP 3131 says that the first character must be in
11984 XID_Start and subsequent characters in XID_Continue,
11985 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011986 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011987 letters, digits, underscore). However, given the current
11988 definition of XID_Start and XID_Continue, it is sufficient
11989 to check just for these, except that _ must be allowed
11990 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011991 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011992 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011993 return 0;
11994
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011995 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011996 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011997 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011998 return 1;
11999}
12000
12001PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012002 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000012003\n\
12004Return True if S is a valid identifier according\n\
Raymond Hettinger378170d2013-03-23 08:21:12 -070012005to the language definition.\n\
12006\n\
12007Use keyword.iskeyword() to test for reserved identifiers\n\
12008such as \"def\" and \"class\".\n");
Martin v. Löwis47383402007-08-15 07:32:56 +000012009
12010static PyObject*
12011unicode_isidentifier(PyObject *self)
12012{
12013 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12014}
12015
Georg Brandl559e5d72008-06-11 18:37:52 +000012016PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012017 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000012018\n\
12019Return True if all characters in S are considered\n\
12020printable in repr() or S is empty, False otherwise.");
12021
12022static PyObject*
12023unicode_isprintable(PyObject *self)
12024{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012025 Py_ssize_t i, length;
12026 int kind;
12027 void *data;
12028
12029 if (PyUnicode_READY(self) == -1)
12030 return NULL;
12031 length = PyUnicode_GET_LENGTH(self);
12032 kind = PyUnicode_KIND(self);
12033 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000012034
12035 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012036 if (length == 1)
12037 return PyBool_FromLong(
12038 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000012039
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012040 for (i = 0; i < length; i++) {
12041 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012042 Py_RETURN_FALSE;
12043 }
12044 }
12045 Py_RETURN_TRUE;
12046}
12047
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012048PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000012049 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012050\n\
12051Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000012052iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012053
12054static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012055unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012056{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012057 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012058}
12059
Martin v. Löwis18e16552006-02-15 17:27:45 +000012060static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012061unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012062{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012063 if (PyUnicode_READY(self) == -1)
12064 return -1;
12065 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012066}
12067
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012068PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012069 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012070\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012071Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012072done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012073
12074static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012075unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012076{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012077 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012078 Py_UCS4 fillchar = ' ';
12079
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012080 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012081 return NULL;
12082
Benjamin Petersonbac79492012-01-14 13:34:47 -050012083 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012084 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012085
Victor Stinnerc4b49542011-12-11 22:44:26 +010012086 if (PyUnicode_GET_LENGTH(self) >= width)
12087 return unicode_result_unchanged(self);
12088
12089 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012090}
12091
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012092PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012093 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012094\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012095Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012096
12097static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012098unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012099{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012100 if (PyUnicode_READY(self) == -1)
12101 return NULL;
12102 if (PyUnicode_IS_ASCII(self))
12103 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012104 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012105}
12106
/* Selectors telling the strip helpers which end(s) of the string to trim */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, indexed by the selectors above */
static const char * const stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Bare method name for error messages: skip the 3-character "|O:" prefix */
#define STRIPNAME(i) (stripformat[i] + 3)
12115
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012116/* externally visible for str.strip(unicode) */
12117PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012118_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012119{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012120 void *data;
12121 int kind;
12122 Py_ssize_t i, j, len;
12123 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012124 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012125
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012126 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12127 return NULL;
12128
12129 kind = PyUnicode_KIND(self);
12130 data = PyUnicode_DATA(self);
12131 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020012132 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012133 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12134 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020012135 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012136
Benjamin Peterson14339b62009-01-31 16:36:08 +000012137 i = 0;
12138 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012139 while (i < len) {
12140 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12141 if (!BLOOM(sepmask, ch))
12142 break;
12143 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12144 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012145 i++;
12146 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012147 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012148
Benjamin Peterson14339b62009-01-31 16:36:08 +000012149 j = len;
12150 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012151 j--;
12152 while (j >= i) {
12153 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12154 if (!BLOOM(sepmask, ch))
12155 break;
12156 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12157 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012158 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012159 }
12160
Benjamin Peterson29060642009-01-31 22:14:21 +000012161 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012162 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012163
Victor Stinner7931d9a2011-11-04 00:22:48 +010012164 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012165}
12166
12167PyObject*
12168PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12169{
12170 unsigned char *data;
12171 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012172 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012173
Victor Stinnerde636f32011-10-01 03:55:54 +020012174 if (PyUnicode_READY(self) == -1)
12175 return NULL;
12176
Victor Stinner684d5fd2012-05-03 02:32:34 +020012177 length = PyUnicode_GET_LENGTH(self);
12178 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012179
Victor Stinner684d5fd2012-05-03 02:32:34 +020012180 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012181 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012182
Victor Stinnerde636f32011-10-01 03:55:54 +020012183 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012184 PyErr_SetString(PyExc_IndexError, "string index out of range");
12185 return NULL;
12186 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012187 if (start >= length || end < start)
12188 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012189
Victor Stinner684d5fd2012-05-03 02:32:34 +020012190 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012191 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012192 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012193 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012194 }
12195 else {
12196 kind = PyUnicode_KIND(self);
12197 data = PyUnicode_1BYTE_DATA(self);
12198 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012199 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012200 length);
12201 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012202}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012203
12204static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012205do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012206{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012207 Py_ssize_t len, i, j;
12208
12209 if (PyUnicode_READY(self) == -1)
12210 return NULL;
12211
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012212 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012213
Victor Stinnercc7af722013-04-09 22:39:24 +020012214 if (PyUnicode_IS_ASCII(self)) {
12215 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12216
12217 i = 0;
12218 if (striptype != RIGHTSTRIP) {
12219 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012220 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012221 if (!_Py_ascii_whitespace[ch])
12222 break;
12223 i++;
12224 }
12225 }
12226
12227 j = len;
12228 if (striptype != LEFTSTRIP) {
12229 j--;
12230 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012231 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012232 if (!_Py_ascii_whitespace[ch])
12233 break;
12234 j--;
12235 }
12236 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012237 }
12238 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012239 else {
12240 int kind = PyUnicode_KIND(self);
12241 void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012242
Victor Stinnercc7af722013-04-09 22:39:24 +020012243 i = 0;
12244 if (striptype != RIGHTSTRIP) {
12245 while (i < len) {
12246 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12247 if (!Py_UNICODE_ISSPACE(ch))
12248 break;
12249 i++;
12250 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012251 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012252
12253 j = len;
12254 if (striptype != LEFTSTRIP) {
12255 j--;
12256 while (j >= i) {
12257 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12258 if (!Py_UNICODE_ISSPACE(ch))
12259 break;
12260 j--;
12261 }
12262 j++;
12263 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012264 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012265
Victor Stinner7931d9a2011-11-04 00:22:48 +010012266 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012267}
12268
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012269
12270static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012271do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012272{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012273 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012274
Serhiy Storchakac6792272013-10-19 21:03:34 +030012275 if (!PyArg_ParseTuple(args, stripformat[striptype], &sep))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012276 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012277
Benjamin Peterson14339b62009-01-31 16:36:08 +000012278 if (sep != NULL && sep != Py_None) {
12279 if (PyUnicode_Check(sep))
12280 return _PyUnicode_XStrip(self, striptype, sep);
12281 else {
12282 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012283 "%s arg must be None or str",
12284 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012285 return NULL;
12286 }
12287 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012288
Benjamin Peterson14339b62009-01-31 16:36:08 +000012289 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012290}
12291
12292
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012293PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012294 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012295\n\
12296Return a copy of the string S with leading and trailing\n\
12297whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012298If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012299
12300static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012301unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012302{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012303 if (PyTuple_GET_SIZE(args) == 0)
12304 return do_strip(self, BOTHSTRIP); /* Common case */
12305 else
12306 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012307}
12308
12309
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012310PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012311 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012312\n\
12313Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012314If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012315
12316static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012317unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012318{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012319 if (PyTuple_GET_SIZE(args) == 0)
12320 return do_strip(self, LEFTSTRIP); /* Common case */
12321 else
12322 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012323}
12324
12325
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012326PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012327 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012328\n\
12329Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012330If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012331
12332static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012333unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012334{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012335 if (PyTuple_GET_SIZE(args) == 0)
12336 return do_strip(self, RIGHTSTRIP); /* Common case */
12337 else
12338 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012339}
12340
12341
Guido van Rossumd57fd912000-03-10 22:53:23 +000012342static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012343unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012344{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012345 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012346 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012347
Serhiy Storchaka05997252013-01-26 12:14:02 +020012348 if (len < 1)
12349 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012350
Victor Stinnerc4b49542011-12-11 22:44:26 +010012351 /* no repeat, return original string */
12352 if (len == 1)
12353 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012354
Benjamin Petersonbac79492012-01-14 13:34:47 -050012355 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012356 return NULL;
12357
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012358 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012359 PyErr_SetString(PyExc_OverflowError,
12360 "repeated string is too long");
12361 return NULL;
12362 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012363 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012364
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012365 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012366 if (!u)
12367 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012368 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012369
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012370 if (PyUnicode_GET_LENGTH(str) == 1) {
12371 const int kind = PyUnicode_KIND(str);
12372 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012373 if (kind == PyUnicode_1BYTE_KIND) {
12374 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012375 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012376 }
12377 else if (kind == PyUnicode_2BYTE_KIND) {
12378 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012379 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012380 ucs2[n] = fill_char;
12381 } else {
12382 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12383 assert(kind == PyUnicode_4BYTE_KIND);
12384 for (n = 0; n < len; ++n)
12385 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012386 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012387 }
12388 else {
12389 /* number of characters copied this far */
12390 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012391 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012392 char *to = (char *) PyUnicode_DATA(u);
Christian Heimesf051e432016-09-13 20:22:02 +020012393 memcpy(to, PyUnicode_DATA(str),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012394 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012395 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012396 n = (done <= nchars-done) ? done : nchars-done;
Christian Heimesf051e432016-09-13 20:22:02 +020012397 memcpy(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012398 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012399 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012400 }
12401
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012402 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012403 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012404}
12405
Alexander Belopolsky40018472011-02-26 01:02:56 +000012406PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012407PyUnicode_Replace(PyObject *str,
12408 PyObject *substr,
12409 PyObject *replstr,
Alexander Belopolsky40018472011-02-26 01:02:56 +000012410 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012411{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012412 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12413 ensure_unicode(replstr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012414 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012415 return replace(str, substr, replstr, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012416}
12417
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012418PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000012419 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012420\n\
12421Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000012422old replaced by new. If the optional argument count is\n\
12423given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012424
12425static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012426unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012427{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012428 PyObject *str1;
12429 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012430 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012431
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012432 if (!PyArg_ParseTuple(args, "UU|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012433 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060012434 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012435 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012436 return replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012437}
12438
Alexander Belopolsky40018472011-02-26 01:02:56 +000012439static PyObject *
12440unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012441{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012442 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012443 Py_ssize_t isize;
12444 Py_ssize_t osize, squote, dquote, i, o;
12445 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012446 int ikind, okind, unchanged;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012447 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012448
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012449 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012450 return NULL;
12451
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012452 isize = PyUnicode_GET_LENGTH(unicode);
12453 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012454
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012455 /* Compute length of output, quote characters, and
12456 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012457 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012458 max = 127;
12459 squote = dquote = 0;
12460 ikind = PyUnicode_KIND(unicode);
12461 for (i = 0; i < isize; i++) {
12462 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040012463 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012464 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040012465 case '\'': squote++; break;
12466 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012467 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040012468 incr = 2;
12469 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012470 default:
12471 /* Fast-path ASCII */
12472 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012473 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012474 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012475 ;
12476 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012477 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012478 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012479 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012480 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012481 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012482 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040012483 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012484 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040012485 if (osize > PY_SSIZE_T_MAX - incr) {
12486 PyErr_SetString(PyExc_OverflowError,
12487 "string is too long to generate repr");
12488 return NULL;
12489 }
12490 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012491 }
12492
12493 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012494 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012495 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012496 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012497 if (dquote)
12498 /* Both squote and dquote present. Use squote,
12499 and escape them */
12500 osize += squote;
12501 else
12502 quote = '"';
12503 }
Victor Stinner55c08782013-04-14 18:45:39 +020012504 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012505
12506 repr = PyUnicode_New(osize, max);
12507 if (repr == NULL)
12508 return NULL;
12509 okind = PyUnicode_KIND(repr);
12510 odata = PyUnicode_DATA(repr);
12511
12512 PyUnicode_WRITE(okind, odata, 0, quote);
12513 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012514 if (unchanged) {
12515 _PyUnicode_FastCopyCharacters(repr, 1,
12516 unicode, 0,
12517 isize);
12518 }
12519 else {
12520 for (i = 0, o = 1; i < isize; i++) {
12521 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012522
Victor Stinner55c08782013-04-14 18:45:39 +020012523 /* Escape quotes and backslashes */
12524 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012525 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012526 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012527 continue;
12528 }
12529
12530 /* Map special whitespace to '\t', \n', '\r' */
12531 if (ch == '\t') {
12532 PyUnicode_WRITE(okind, odata, o++, '\\');
12533 PyUnicode_WRITE(okind, odata, o++, 't');
12534 }
12535 else if (ch == '\n') {
12536 PyUnicode_WRITE(okind, odata, o++, '\\');
12537 PyUnicode_WRITE(okind, odata, o++, 'n');
12538 }
12539 else if (ch == '\r') {
12540 PyUnicode_WRITE(okind, odata, o++, '\\');
12541 PyUnicode_WRITE(okind, odata, o++, 'r');
12542 }
12543
12544 /* Map non-printable US ASCII to '\xhh' */
12545 else if (ch < ' ' || ch == 0x7F) {
12546 PyUnicode_WRITE(okind, odata, o++, '\\');
12547 PyUnicode_WRITE(okind, odata, o++, 'x');
12548 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12549 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12550 }
12551
12552 /* Copy ASCII characters as-is */
12553 else if (ch < 0x7F) {
12554 PyUnicode_WRITE(okind, odata, o++, ch);
12555 }
12556
12557 /* Non-ASCII characters */
12558 else {
12559 /* Map Unicode whitespace and control characters
12560 (categories Z* and C* except ASCII space)
12561 */
12562 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12563 PyUnicode_WRITE(okind, odata, o++, '\\');
12564 /* Map 8-bit characters to '\xhh' */
12565 if (ch <= 0xff) {
12566 PyUnicode_WRITE(okind, odata, o++, 'x');
12567 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12568 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12569 }
12570 /* Map 16-bit characters to '\uxxxx' */
12571 else if (ch <= 0xffff) {
12572 PyUnicode_WRITE(okind, odata, o++, 'u');
12573 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12574 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12575 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12576 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12577 }
12578 /* Map 21-bit characters to '\U00xxxxxx' */
12579 else {
12580 PyUnicode_WRITE(okind, odata, o++, 'U');
12581 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12582 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12583 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12584 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12585 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12586 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12587 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12588 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12589 }
12590 }
12591 /* Copy characters as-is */
12592 else {
12593 PyUnicode_WRITE(okind, odata, o++, ch);
12594 }
Georg Brandl559e5d72008-06-11 18:37:52 +000012595 }
12596 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012597 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012598 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012599 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012600 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012601}
12602
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012603PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012604 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012605\n\
12606Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012607such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012608arguments start and end are interpreted as in slice notation.\n\
12609\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012610Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012611
12612static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012613unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012614{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012615 /* initialize variables to prevent gcc warning */
12616 PyObject *substring = NULL;
12617 Py_ssize_t start = 0;
12618 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012619 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012620
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012621 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012622 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012623
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012624 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012625 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012626
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012627 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012628
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012629 if (result == -2)
12630 return NULL;
12631
Christian Heimes217cfd12007-12-02 14:31:20 +000012632 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012633}
12634
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012635PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012636 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012637\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012638Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012639
12640static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012641unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012642{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012643 /* initialize variables to prevent gcc warning */
12644 PyObject *substring = NULL;
12645 Py_ssize_t start = 0;
12646 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012647 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012648
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012649 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012650 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012651
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012652 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012653 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012654
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012655 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012656
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012657 if (result == -2)
12658 return NULL;
12659
Guido van Rossumd57fd912000-03-10 22:53:23 +000012660 if (result < 0) {
12661 PyErr_SetString(PyExc_ValueError, "substring not found");
12662 return NULL;
12663 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012664
Christian Heimes217cfd12007-12-02 14:31:20 +000012665 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012666}
12667
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012668PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012669 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012670\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012671Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012672done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012673
12674static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012675unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012676{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012677 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012678 Py_UCS4 fillchar = ' ';
12679
Victor Stinnere9a29352011-10-01 02:14:59 +020012680 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012681 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012682
Benjamin Petersonbac79492012-01-14 13:34:47 -050012683 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012684 return NULL;
12685
Victor Stinnerc4b49542011-12-11 22:44:26 +010012686 if (PyUnicode_GET_LENGTH(self) >= width)
12687 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012688
Victor Stinnerc4b49542011-12-11 22:44:26 +010012689 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012690}
12691
Alexander Belopolsky40018472011-02-26 01:02:56 +000012692PyObject *
12693PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012694{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012695 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012696 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012697
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012698 return split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012699}
12700
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012701PyDoc_STRVAR(split__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012702 "S.split(sep=None, maxsplit=-1) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012703\n\
12704Return a list of the words in S, using sep as the\n\
12705delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012706splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012707whitespace string is a separator and empty strings are\n\
12708removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012709
12710static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012711unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012712{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012713 static char *kwlist[] = {"sep", "maxsplit", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000012714 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012715 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012716
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012717 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12718 kwlist, &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012719 return NULL;
12720
12721 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012722 return split(self, NULL, maxcount);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012723
12724 if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012725 return split(self, substring, maxcount);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012726
12727 PyErr_Format(PyExc_TypeError,
12728 "must be str or None, not %.100s",
12729 Py_TYPE(substring)->tp_name);
12730 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012731}
12732
Thomas Wouters477c8d52006-05-27 19:21:47 +000012733PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012734PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012735{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012736 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012737 int kind1, kind2;
12738 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012739 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012740
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012741 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012742 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012743
Victor Stinner14f8f022011-10-05 20:58:25 +020012744 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012745 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012746 len1 = PyUnicode_GET_LENGTH(str_obj);
12747 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012748 if (kind1 < kind2 || len1 < len2) {
12749 _Py_INCREF_UNICODE_EMPTY();
12750 if (!unicode_empty)
12751 out = NULL;
12752 else {
12753 out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
12754 Py_DECREF(unicode_empty);
12755 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012756 return out;
12757 }
12758 buf1 = PyUnicode_DATA(str_obj);
12759 buf2 = PyUnicode_DATA(sep_obj);
12760 if (kind2 != kind1) {
12761 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12762 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012763 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012764 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012765
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012766 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012767 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012768 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12769 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12770 else
12771 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012772 break;
12773 case PyUnicode_2BYTE_KIND:
12774 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12775 break;
12776 case PyUnicode_4BYTE_KIND:
12777 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12778 break;
12779 default:
12780 assert(0);
12781 out = 0;
12782 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012783
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012784 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012785 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012786
12787 return out;
12788}
12789
12790
12791PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012792PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012793{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012794 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012795 int kind1, kind2;
12796 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012797 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012798
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012799 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012800 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012801
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012802 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012803 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012804 len1 = PyUnicode_GET_LENGTH(str_obj);
12805 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012806 if (kind1 < kind2 || len1 < len2) {
12807 _Py_INCREF_UNICODE_EMPTY();
12808 if (!unicode_empty)
12809 out = NULL;
12810 else {
12811 out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
12812 Py_DECREF(unicode_empty);
12813 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012814 return out;
12815 }
12816 buf1 = PyUnicode_DATA(str_obj);
12817 buf2 = PyUnicode_DATA(sep_obj);
12818 if (kind2 != kind1) {
12819 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12820 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012821 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012822 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012823
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012824 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012825 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012826 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12827 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12828 else
12829 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012830 break;
12831 case PyUnicode_2BYTE_KIND:
12832 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12833 break;
12834 case PyUnicode_4BYTE_KIND:
12835 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12836 break;
12837 default:
12838 assert(0);
12839 out = 0;
12840 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012841
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012842 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012843 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012844
12845 return out;
12846}
12847
12848PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012849 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012850\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012851Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012852the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012853found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012854
12855static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012856unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012857{
Victor Stinner9310abb2011-10-05 00:59:23 +020012858 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012859}
12860
12861PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012862 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012863\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012864Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012865the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012866separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012867
12868static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012869unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012870{
Victor Stinner9310abb2011-10-05 00:59:23 +020012871 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012872}
12873
Alexander Belopolsky40018472011-02-26 01:02:56 +000012874PyObject *
12875PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012876{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012877 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012878 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012879
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012880 return rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012881}
12882
12883PyDoc_STRVAR(rsplit__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012884 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012885\n\
12886Return a list of the words in S, using sep as the\n\
12887delimiter string, starting at the end of the string and\n\
12888working to the front. If maxsplit is given, at most maxsplit\n\
12889splits are done. If sep is not specified, any whitespace string\n\
12890is a separator.");
12891
12892static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012893unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012894{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012895 static char *kwlist[] = {"sep", "maxsplit", 0};
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012896 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012897 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012898
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012899 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
12900 kwlist, &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012901 return NULL;
12902
12903 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012904 return rsplit(self, NULL, maxcount);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012905
12906 if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012907 return rsplit(self, substring, maxcount);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012908
12909 PyErr_Format(PyExc_TypeError,
12910 "must be str or None, not %.100s",
12911 Py_TYPE(substring)->tp_name);
12912 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012913}
12914
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012915PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012916 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012917\n\
12918Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012919Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012920is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012921
12922static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012923unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012924{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012925 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012926 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012927
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012928 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12929 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012930 return NULL;
12931
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012932 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012933}
12934
12935static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012936PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012937{
Victor Stinnerc4b49542011-12-11 22:44:26 +010012938 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012939}
12940
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012941PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012942 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012943\n\
12944Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012945and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012946
12947static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012948unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012949{
Benjamin Petersoneea48462012-01-16 14:28:50 -050012950 if (PyUnicode_READY(self) == -1)
12951 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012952 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012953}
12954
Larry Hastings61272b72014-01-07 12:41:53 -080012955/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000012956
Larry Hastings31826802013-10-19 00:09:25 -070012957@staticmethod
12958str.maketrans as unicode_maketrans
12959
12960 x: object
12961
12962 y: unicode=NULL
12963
12964 z: unicode=NULL
12965
12966 /
12967
12968Return a translation table usable for str.translate().
12969
12970If there is only one argument, it must be a dictionary mapping Unicode
12971ordinals (integers) or characters to Unicode ordinals, strings or None.
12972Character keys will be then converted to ordinals.
12973If there are two arguments, they must be strings of equal length, and
12974in the resulting dictionary, each character in x will be mapped to the
12975character at the same position in y. If there is a third argument, it
12976must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080012977[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070012978
Larry Hastings31826802013-10-19 00:09:25 -070012979static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080012980unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030012981/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070012982{
Georg Brandlceee0772007-11-27 23:48:05 +000012983 PyObject *new = NULL, *key, *value;
12984 Py_ssize_t i = 0;
12985 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012986
Georg Brandlceee0772007-11-27 23:48:05 +000012987 new = PyDict_New();
12988 if (!new)
12989 return NULL;
12990 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012991 int x_kind, y_kind, z_kind;
12992 void *x_data, *y_data, *z_data;
12993
Georg Brandlceee0772007-11-27 23:48:05 +000012994 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012995 if (!PyUnicode_Check(x)) {
12996 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12997 "be a string if there is a second argument");
12998 goto err;
12999 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013000 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013001 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13002 "arguments must have equal length");
13003 goto err;
13004 }
13005 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013006 x_kind = PyUnicode_KIND(x);
13007 y_kind = PyUnicode_KIND(y);
13008 x_data = PyUnicode_DATA(x);
13009 y_data = PyUnicode_DATA(y);
13010 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13011 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013012 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000013013 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060013014 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013015 if (!value) {
13016 Py_DECREF(key);
13017 goto err;
13018 }
Georg Brandlceee0772007-11-27 23:48:05 +000013019 res = PyDict_SetItem(new, key, value);
13020 Py_DECREF(key);
13021 Py_DECREF(value);
13022 if (res < 0)
13023 goto err;
13024 }
13025 /* create entries for deleting chars in z */
13026 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013027 z_kind = PyUnicode_KIND(z);
13028 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013029 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013030 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013031 if (!key)
13032 goto err;
13033 res = PyDict_SetItem(new, key, Py_None);
13034 Py_DECREF(key);
13035 if (res < 0)
13036 goto err;
13037 }
13038 }
13039 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013040 int kind;
13041 void *data;
13042
Georg Brandlceee0772007-11-27 23:48:05 +000013043 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013044 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013045 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13046 "to maketrans it must be a dict");
13047 goto err;
13048 }
13049 /* copy entries into the new dict, converting string keys to int keys */
13050 while (PyDict_Next(x, &i, &key, &value)) {
13051 if (PyUnicode_Check(key)) {
13052 /* convert string keys to integer keys */
13053 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013054 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013055 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13056 "table must be of length 1");
13057 goto err;
13058 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013059 kind = PyUnicode_KIND(key);
13060 data = PyUnicode_DATA(key);
13061 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013062 if (!newkey)
13063 goto err;
13064 res = PyDict_SetItem(new, newkey, value);
13065 Py_DECREF(newkey);
13066 if (res < 0)
13067 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013068 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013069 /* just keep integer keys */
13070 if (PyDict_SetItem(new, key, value) < 0)
13071 goto err;
13072 } else {
13073 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13074 "be strings or integers");
13075 goto err;
13076 }
13077 }
13078 }
13079 return new;
13080 err:
13081 Py_DECREF(new);
13082 return NULL;
13083}
13084
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013085PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013086 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013087\n\
Zachary Ware79b98df2015-08-05 23:54:15 -050013088Return a copy of the string S in which each character has been mapped\n\
13089through the given translation table. The table must implement\n\
13090lookup/indexing via __getitem__, for instance a dictionary or list,\n\
13091mapping Unicode ordinals to Unicode ordinals, strings, or None. If\n\
13092this operation raises LookupError, the character is left untouched.\n\
13093Characters mapped to None are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013094
13095static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013096unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013097{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013098 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013099}
13100
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013101PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013102 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013103\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013104Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013105
13106static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020013107unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013108{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013109 if (PyUnicode_READY(self) == -1)
13110 return NULL;
13111 if (PyUnicode_IS_ASCII(self))
13112 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013113 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013114}
13115
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013116PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013117 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013118\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000013119Pad a numeric string S with zeros on the left, to fill a field\n\
13120of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013121
13122static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020013123unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013124{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013125 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013126 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013127 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013128 int kind;
13129 void *data;
13130 Py_UCS4 chr;
13131
Martin v. Löwis18e16552006-02-15 17:27:45 +000013132 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000013133 return NULL;
13134
Benjamin Petersonbac79492012-01-14 13:34:47 -050013135 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013136 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013137
Victor Stinnerc4b49542011-12-11 22:44:26 +010013138 if (PyUnicode_GET_LENGTH(self) >= width)
13139 return unicode_result_unchanged(self);
13140
13141 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013142
13143 u = pad(self, fill, 0, '0');
13144
Walter Dörwald068325e2002-04-15 13:36:47 +000013145 if (u == NULL)
13146 return NULL;
13147
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013148 kind = PyUnicode_KIND(u);
13149 data = PyUnicode_DATA(u);
13150 chr = PyUnicode_READ(kind, data, fill);
13151
13152 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013153 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013154 PyUnicode_WRITE(kind, data, 0, chr);
13155 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013156 }
13157
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013158 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013159 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013160}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013161
#if 0
/* Compiled out.  Thin wrapper that forwards self to
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
13169
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013170PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013171 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013172\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013173Return True if S starts with the specified prefix, False otherwise.\n\
13174With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013175With optional end, stop comparing S at that position.\n\
13176prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013177
13178static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013179unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013180 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013181{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013182 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013183 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013184 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013185 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013186 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013187
Jesus Ceaac451502011-04-20 17:09:23 +020013188 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013189 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013190 if (PyTuple_Check(subobj)) {
13191 Py_ssize_t i;
13192 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013193 substring = PyTuple_GET_ITEM(subobj, i);
13194 if (!PyUnicode_Check(substring)) {
13195 PyErr_Format(PyExc_TypeError,
13196 "tuple for startswith must only contain str, "
13197 "not %.100s",
13198 Py_TYPE(substring)->tp_name);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013199 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013200 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013201 result = tailmatch(self, substring, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013202 if (result == -1)
13203 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013204 if (result) {
13205 Py_RETURN_TRUE;
13206 }
13207 }
13208 /* nothing matched */
13209 Py_RETURN_FALSE;
13210 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013211 if (!PyUnicode_Check(subobj)) {
13212 PyErr_Format(PyExc_TypeError,
13213 "startswith first arg must be str or "
13214 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013215 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013216 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013217 result = tailmatch(self, subobj, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013218 if (result == -1)
13219 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013220 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013221}
13222
13223
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013224PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013225 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013226\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013227Return True if S ends with the specified suffix, False otherwise.\n\
13228With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013229With optional end, stop comparing S at that position.\n\
13230suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013231
13232static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013233unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013234 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013235{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013236 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013237 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013238 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013239 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013240 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013241
Jesus Ceaac451502011-04-20 17:09:23 +020013242 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013243 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013244 if (PyTuple_Check(subobj)) {
13245 Py_ssize_t i;
13246 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013247 substring = PyTuple_GET_ITEM(subobj, i);
13248 if (!PyUnicode_Check(substring)) {
13249 PyErr_Format(PyExc_TypeError,
13250 "tuple for endswith must only contain str, "
13251 "not %.100s",
13252 Py_TYPE(substring)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013253 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013254 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013255 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013256 if (result == -1)
13257 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013258 if (result) {
13259 Py_RETURN_TRUE;
13260 }
13261 }
13262 Py_RETURN_FALSE;
13263 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013264 if (!PyUnicode_Check(subobj)) {
13265 PyErr_Format(PyExc_TypeError,
13266 "endswith first arg must be str or "
13267 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013268 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013269 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013270 result = tailmatch(self, subobj, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013271 if (result == -1)
13272 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013273 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013274}
13275
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013276static inline void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013277_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013278{
Victor Stinnereb36fda2015-10-03 01:55:51 +020013279 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13280 writer->data = PyUnicode_DATA(writer->buffer);
13281
13282 if (!writer->readonly) {
13283 writer->kind = PyUnicode_KIND(writer->buffer);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013284 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinnereb36fda2015-10-03 01:55:51 +020013285 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013286 else {
Victor Stinnereb36fda2015-10-03 01:55:51 +020013287 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13288 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13289 writer->kind = PyUnicode_WCHAR_KIND;
13290 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13291
Victor Stinner8f674cc2013-04-17 23:02:17 +020013292 /* Copy-on-write mode: set buffer size to 0 so
13293 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13294 * next write. */
13295 writer->size = 0;
13296 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013297}
13298
Victor Stinnerd3f08822012-05-29 12:57:52 +020013299void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013300_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013301{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013302 memset(writer, 0, sizeof(*writer));
Victor Stinnereb36fda2015-10-03 01:55:51 +020013303
13304 /* ASCII is the bare minimum */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013305 writer->min_char = 127;
Victor Stinnereb36fda2015-10-03 01:55:51 +020013306
13307 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13308 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13309 writer->kind = PyUnicode_WCHAR_KIND;
13310 assert(writer->kind <= PyUnicode_1BYTE_KIND);
Victor Stinner202fdca2012-05-07 12:47:02 +020013311}
13312
Victor Stinnerd3f08822012-05-29 12:57:52 +020013313int
13314_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13315 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013316{
13317 Py_ssize_t newlen;
13318 PyObject *newbuffer;
13319
Victor Stinner2740e462016-09-06 16:58:36 -070013320 assert(maxchar <= MAX_UNICODE);
13321
Victor Stinnerca9381e2015-09-22 00:58:32 +020013322 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
Victor Stinner61744742015-09-22 01:01:17 +020013323 assert((maxchar > writer->maxchar && length >= 0)
13324 || length > 0);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013325
Victor Stinner202fdca2012-05-07 12:47:02 +020013326 if (length > PY_SSIZE_T_MAX - writer->pos) {
13327 PyErr_NoMemory();
13328 return -1;
13329 }
13330 newlen = writer->pos + length;
13331
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013332 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013333
Victor Stinnerd3f08822012-05-29 12:57:52 +020013334 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013335 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013336 if (writer->overallocate
13337 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13338 /* overallocate to limit the number of realloc() */
13339 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013340 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013341 if (newlen < writer->min_length)
13342 newlen = writer->min_length;
13343
Victor Stinnerd3f08822012-05-29 12:57:52 +020013344 writer->buffer = PyUnicode_New(newlen, maxchar);
13345 if (writer->buffer == NULL)
13346 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013347 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013348 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013349 if (writer->overallocate
13350 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13351 /* overallocate to limit the number of realloc() */
13352 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013353 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013354 if (newlen < writer->min_length)
13355 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013356
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013357 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013358 /* resize + widen */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +030013359 maxchar = Py_MAX(maxchar, writer->maxchar);
Victor Stinner202fdca2012-05-07 12:47:02 +020013360 newbuffer = PyUnicode_New(newlen, maxchar);
13361 if (newbuffer == NULL)
13362 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013363 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13364 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013365 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013366 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013367 }
13368 else {
13369 newbuffer = resize_compact(writer->buffer, newlen);
13370 if (newbuffer == NULL)
13371 return -1;
13372 }
13373 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013374 }
13375 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013376 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013377 newbuffer = PyUnicode_New(writer->size, maxchar);
13378 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013379 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013380 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13381 writer->buffer, 0, writer->pos);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030013382 Py_SETREF(writer->buffer, newbuffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013383 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013384 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013385 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010013386
13387#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020013388}
13389
Victor Stinnerca9381e2015-09-22 00:58:32 +020013390int
13391_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13392 enum PyUnicode_Kind kind)
13393{
13394 Py_UCS4 maxchar;
13395
13396 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13397 assert(writer->kind < kind);
13398
13399 switch (kind)
13400 {
13401 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13402 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13403 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
13404 default:
13405 assert(0 && "invalid kind");
13406 return -1;
13407 }
13408
13409 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13410}
13411
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013412static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013413_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013414{
Victor Stinner2740e462016-09-06 16:58:36 -070013415 assert(ch <= MAX_UNICODE);
Victor Stinnera0dd0212013-04-11 22:09:04 +020013416 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13417 return -1;
13418 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13419 writer->pos++;
13420 return 0;
13421}
13422
13423int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013424_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13425{
13426 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13427}
13428
13429int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013430_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13431{
13432 Py_UCS4 maxchar;
13433 Py_ssize_t len;
13434
13435 if (PyUnicode_READY(str) == -1)
13436 return -1;
13437 len = PyUnicode_GET_LENGTH(str);
13438 if (len == 0)
13439 return 0;
13440 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13441 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013442 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010013443 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020013444 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013445 Py_INCREF(str);
13446 writer->buffer = str;
13447 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013448 writer->pos += len;
13449 return 0;
13450 }
13451 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13452 return -1;
13453 }
13454 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13455 str, 0, len);
13456 writer->pos += len;
13457 return 0;
13458}
13459
Victor Stinnere215d962012-10-06 23:03:36 +020013460int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013461_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13462 Py_ssize_t start, Py_ssize_t end)
13463{
13464 Py_UCS4 maxchar;
13465 Py_ssize_t len;
13466
13467 if (PyUnicode_READY(str) == -1)
13468 return -1;
13469
13470 assert(0 <= start);
13471 assert(end <= PyUnicode_GET_LENGTH(str));
13472 assert(start <= end);
13473
13474 if (end == 0)
13475 return 0;
13476
13477 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13478 return _PyUnicodeWriter_WriteStr(writer, str);
13479
13480 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13481 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13482 else
13483 maxchar = writer->maxchar;
13484 len = end - start;
13485
13486 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13487 return -1;
13488
13489 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13490 str, start, len);
13491 writer->pos += len;
13492 return 0;
13493}
13494
13495int
Victor Stinner4a587072013-11-19 12:54:53 +010013496_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13497 const char *ascii, Py_ssize_t len)
13498{
13499 if (len == -1)
13500 len = strlen(ascii);
13501
13502 assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13503
13504 if (writer->buffer == NULL && !writer->overallocate) {
13505 PyObject *str;
13506
13507 str = _PyUnicode_FromASCII(ascii, len);
13508 if (str == NULL)
13509 return -1;
13510
13511 writer->readonly = 1;
13512 writer->buffer = str;
13513 _PyUnicodeWriter_Update(writer);
13514 writer->pos += len;
13515 return 0;
13516 }
13517
13518 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13519 return -1;
13520
13521 switch (writer->kind)
13522 {
13523 case PyUnicode_1BYTE_KIND:
13524 {
13525 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13526 Py_UCS1 *data = writer->data;
13527
Christian Heimesf051e432016-09-13 20:22:02 +020013528 memcpy(data + writer->pos, str, len);
Victor Stinner4a587072013-11-19 12:54:53 +010013529 break;
13530 }
13531 case PyUnicode_2BYTE_KIND:
13532 {
13533 _PyUnicode_CONVERT_BYTES(
13534 Py_UCS1, Py_UCS2,
13535 ascii, ascii + len,
13536 (Py_UCS2 *)writer->data + writer->pos);
13537 break;
13538 }
13539 case PyUnicode_4BYTE_KIND:
13540 {
13541 _PyUnicode_CONVERT_BYTES(
13542 Py_UCS1, Py_UCS4,
13543 ascii, ascii + len,
13544 (Py_UCS4 *)writer->data + writer->pos);
13545 break;
13546 }
13547 default:
13548 assert(0);
13549 }
13550
13551 writer->pos += len;
13552 return 0;
13553}
13554
13555int
13556_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13557 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020013558{
13559 Py_UCS4 maxchar;
13560
13561 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13562 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13563 return -1;
13564 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13565 writer->pos += len;
13566 return 0;
13567}
13568
Victor Stinnerd3f08822012-05-29 12:57:52 +020013569PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013570_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013571{
Victor Stinner15a0bd32013-07-08 22:29:55 +020013572 PyObject *str;
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013573
Victor Stinnerd3f08822012-05-29 12:57:52 +020013574 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013575 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013576 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013577 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013578
13579 str = writer->buffer;
13580 writer->buffer = NULL;
13581
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013582 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013583 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13584 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013585 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013586
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013587 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
13588 PyObject *str2;
13589 str2 = resize_compact(str, writer->pos);
13590 if (str2 == NULL) {
13591 Py_DECREF(str);
13592 return NULL;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013593 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013594 str = str2;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013595 }
13596
Victor Stinner15a0bd32013-07-08 22:29:55 +020013597 assert(_PyUnicode_CheckConsistency(str, 1));
13598 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020013599}
13600
Victor Stinnerd3f08822012-05-29 12:57:52 +020013601void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013602_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013603{
13604 Py_CLEAR(writer->buffer);
13605}
13606
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013607#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013608
13609PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013610 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013611\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013612Return a formatted version of S, using substitutions from args and kwargs.\n\
13613The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013614
Eric Smith27bbca62010-11-04 17:06:58 +000013615PyDoc_STRVAR(format_map__doc__,
13616 "S.format_map(mapping) -> str\n\
13617\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013618Return a formatted version of S, using substitutions from mapping.\n\
13619The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013620
Eric Smith4a7d76d2008-05-30 18:10:19 +000013621static PyObject *
13622unicode__format__(PyObject* self, PyObject* args)
13623{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013624 PyObject *format_spec;
13625 _PyUnicodeWriter writer;
13626 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013627
13628 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
13629 return NULL;
13630
Victor Stinnerd3f08822012-05-29 12:57:52 +020013631 if (PyUnicode_READY(self) == -1)
13632 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020013633 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013634 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13635 self, format_spec, 0,
13636 PyUnicode_GET_LENGTH(format_spec));
13637 if (ret == -1) {
13638 _PyUnicodeWriter_Dealloc(&writer);
13639 return NULL;
13640 }
13641 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013642}
13643
Eric Smith8c663262007-08-25 02:26:07 +000013644PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013645 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013646\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013647Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000013648
13649static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013650unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013651{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013652 Py_ssize_t size;
13653
13654 /* If it's a compact object, account for base structure +
13655 character data. */
13656 if (PyUnicode_IS_COMPACT_ASCII(v))
13657 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
13658 else if (PyUnicode_IS_COMPACT(v))
13659 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013660 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013661 else {
13662 /* If it is a two-block object, account for base object, and
13663 for character block if present. */
13664 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020013665 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013666 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013667 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013668 }
13669 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013670 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020013671 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013672 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020013673 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020013674 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013675
13676 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013677}
13678
13679PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013680 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013681
13682static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013683unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013684{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013685 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013686 if (!copy)
13687 return NULL;
13688 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013689}
13690
Guido van Rossumd57fd912000-03-10 22:53:23 +000013691static PyMethodDef unicode_methods[] = {
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000013692 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013693 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
Ezio Melotticda6b6d2012-02-26 09:39:55 +020013694 {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
13695 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013696 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
13697 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
Benjamin Petersond5890c82012-01-14 13:23:30 -050013698 {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013699 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
13700 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
13701 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
Ezio Melotti745d54d2013-11-16 19:10:57 +020013702 {"expandtabs", (PyCFunction) unicode_expandtabs,
13703 METH_VARARGS | METH_KEYWORDS, expandtabs__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013704 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013705 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013706 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
13707 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
13708 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013709 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013710 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13711 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
13712 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013713 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013714 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Ezio Melotti745d54d2013-11-16 19:10:57 +020013715 {"splitlines", (PyCFunction) unicode_splitlines,
13716 METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013717 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013718 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
13719 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
13720 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
13721 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13722 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
13723 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
13724 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
13725 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
13726 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
13727 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
13728 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
13729 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
13730 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
13731 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000013732 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000013733 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013734 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000013735 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013736 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000013737 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Larry Hastings31826802013-10-19 00:09:25 -070013738 UNICODE_MAKETRANS_METHODDEF
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013739 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000013740#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013741 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013742 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013743#endif
13744
Benjamin Peterson14339b62009-01-31 16:36:08 +000013745 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013746 {NULL, NULL}
13747};
13748
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013749static PyObject *
13750unicode_mod(PyObject *v, PyObject *w)
13751{
Brian Curtindfc80e32011-08-10 20:28:54 -050013752 if (!PyUnicode_Check(v))
13753 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013754 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013755}
13756
13757static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013758 0, /*nb_add*/
13759 0, /*nb_subtract*/
13760 0, /*nb_multiply*/
13761 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013762};
13763
Guido van Rossumd57fd912000-03-10 22:53:23 +000013764static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013765 (lenfunc) unicode_length, /* sq_length */
13766 PyUnicode_Concat, /* sq_concat */
13767 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13768 (ssizeargfunc) unicode_getitem, /* sq_item */
13769 0, /* sq_slice */
13770 0, /* sq_ass_item */
13771 0, /* sq_ass_slice */
13772 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013773};
13774
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013775static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013776unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013777{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013778 if (PyUnicode_READY(self) == -1)
13779 return NULL;
13780
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013781 if (PyIndex_Check(item)) {
13782 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013783 if (i == -1 && PyErr_Occurred())
13784 return NULL;
13785 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013786 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013787 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013788 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013789 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013790 PyObject *result;
13791 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013792 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013793 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013794
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013795 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013796 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013797 return NULL;
13798 }
13799
13800 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020013801 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013802 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013803 slicelength == PyUnicode_GET_LENGTH(self)) {
13804 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013805 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013806 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013807 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013808 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013809 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013810 src_kind = PyUnicode_KIND(self);
13811 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013812 if (!PyUnicode_IS_ASCII(self)) {
13813 kind_limit = kind_maxchar_limit(src_kind);
13814 max_char = 0;
13815 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13816 ch = PyUnicode_READ(src_kind, src_data, cur);
13817 if (ch > max_char) {
13818 max_char = ch;
13819 if (max_char >= kind_limit)
13820 break;
13821 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013822 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013823 }
Victor Stinner55c99112011-10-13 01:17:06 +020013824 else
13825 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013826 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013827 if (result == NULL)
13828 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013829 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013830 dest_data = PyUnicode_DATA(result);
13831
13832 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013833 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13834 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013835 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013836 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013837 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013838 } else {
13839 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13840 return NULL;
13841 }
13842}
13843
13844static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013845 (lenfunc)unicode_length, /* mp_length */
13846 (binaryfunc)unicode_subscript, /* mp_subscript */
13847 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013848};
13849
Guido van Rossumd57fd912000-03-10 22:53:23 +000013850
Guido van Rossumd57fd912000-03-10 22:53:23 +000013851/* Helpers for PyUnicode_Format() */
13852
Victor Stinnera47082312012-10-04 02:19:54 +020013853struct unicode_formatter_t {
13854 PyObject *args;
13855 int args_owned;
13856 Py_ssize_t arglen, argidx;
13857 PyObject *dict;
13858
13859 enum PyUnicode_Kind fmtkind;
13860 Py_ssize_t fmtcnt, fmtpos;
13861 void *fmtdata;
13862 PyObject *fmtstr;
13863
13864 _PyUnicodeWriter writer;
13865};
13866
13867struct unicode_format_arg_t {
13868 Py_UCS4 ch;
13869 int flags;
13870 Py_ssize_t width;
13871 int prec;
13872 int sign;
13873};
13874
Guido van Rossumd57fd912000-03-10 22:53:23 +000013875static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020013876unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013877{
Victor Stinnera47082312012-10-04 02:19:54 +020013878 Py_ssize_t argidx = ctx->argidx;
13879
13880 if (argidx < ctx->arglen) {
13881 ctx->argidx++;
13882 if (ctx->arglen < 0)
13883 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000013884 else
Victor Stinnera47082312012-10-04 02:19:54 +020013885 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013886 }
13887 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013888 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013889 return NULL;
13890}
13891
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013892/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013893
Victor Stinnera47082312012-10-04 02:19:54 +020013894/* Format a float into the writer if the writer is not NULL, or into *p_output
13895 otherwise.
13896
13897 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020013898static int
Victor Stinnera47082312012-10-04 02:19:54 +020013899formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
13900 PyObject **p_output,
13901 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013902{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013903 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013904 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013905 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020013906 int prec;
13907 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000013908
Guido van Rossumd57fd912000-03-10 22:53:23 +000013909 x = PyFloat_AsDouble(v);
13910 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020013911 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013912
Victor Stinnera47082312012-10-04 02:19:54 +020013913 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013914 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013915 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013916
Victor Stinnera47082312012-10-04 02:19:54 +020013917 if (arg->flags & F_ALT)
13918 dtoa_flags = Py_DTSF_ALT;
13919 else
13920 dtoa_flags = 0;
13921 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013922 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020013923 return -1;
13924 len = strlen(p);
13925 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010013926 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020013927 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013928 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020013929 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020013930 }
13931 else
13932 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000013933 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013934 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013935}
13936
Victor Stinnerd0880d52012-04-27 23:40:13 +020013937/* formatlong() emulates the format codes d, u, o, x and X, and
13938 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
13939 * Python's regular ints.
13940 * Return value: a new PyUnicodeObject*, or NULL if error.
13941 * The output string is of the form
13942 * "-"? ("0x" | "0X")? digit+
13943 * "0x"/"0X" are present only for x and X conversions, with F_ALT
13944 * set in flags. The case of hex digits will be correct,
13945 * There will be at least prec digits, zero-filled on the left if
13946 * necessary to get that many.
13947 * val object to be converted
13948 * flags bitmask of format flags; only F_ALT is looked at
13949 * prec minimum number of digits; 0-fill on left if needed
13950 * type a character in [duoxX]; u acts the same as d
13951 *
13952 * CAUTION: o, x and X conversions on regular ints can never
13953 * produce a '-' sign, but can for Python's unbounded ints.
13954 */
Ethan Furmanb95b5612015-01-23 20:05:18 -080013955PyObject *
13956_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
Tim Peters38fd5b62000-09-21 05:43:11 +000013957{
Victor Stinnerd0880d52012-04-27 23:40:13 +020013958 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013959 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013960 Py_ssize_t i;
13961 int sign; /* 1 if '-', else 0 */
13962 int len; /* number of characters */
13963 Py_ssize_t llen;
13964 int numdigits; /* len == numnondigits + numdigits */
13965 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000013966
Victor Stinnerd0880d52012-04-27 23:40:13 +020013967 /* Avoid exceeding SSIZE_T_MAX */
13968 if (prec > INT_MAX-3) {
13969 PyErr_SetString(PyExc_OverflowError,
13970 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013971 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013972 }
13973
13974 assert(PyLong_Check(val));
13975
13976 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020013977 default:
13978 assert(!"'type' not in [diuoxX]");
Victor Stinnerd0880d52012-04-27 23:40:13 +020013979 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020013980 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020013981 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070013982 /* int and int subclasses should print numerically when a numeric */
13983 /* format code is used (see issue18780) */
13984 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013985 break;
13986 case 'o':
13987 numnondigits = 2;
13988 result = PyNumber_ToBase(val, 8);
13989 break;
13990 case 'x':
13991 case 'X':
13992 numnondigits = 2;
13993 result = PyNumber_ToBase(val, 16);
13994 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013995 }
13996 if (!result)
13997 return NULL;
13998
13999 assert(unicode_modifiable(result));
14000 assert(PyUnicode_IS_READY(result));
14001 assert(PyUnicode_IS_ASCII(result));
14002
14003 /* To modify the string in-place, there can only be one reference. */
14004 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014005 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014006 PyErr_BadInternalCall();
14007 return NULL;
14008 }
14009 buf = PyUnicode_DATA(result);
14010 llen = PyUnicode_GET_LENGTH(result);
14011 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014012 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014013 PyErr_SetString(PyExc_ValueError,
Ethan Furmanb95b5612015-01-23 20:05:18 -080014014 "string too large in _PyUnicode_FormatLong");
Victor Stinnerd0880d52012-04-27 23:40:13 +020014015 return NULL;
14016 }
14017 len = (int)llen;
14018 sign = buf[0] == '-';
14019 numnondigits += sign;
14020 numdigits = len - numnondigits;
14021 assert(numdigits > 0);
14022
14023 /* Get rid of base marker unless F_ALT */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014024 if (((alt) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020014025 (type == 'o' || type == 'x' || type == 'X'))) {
14026 assert(buf[sign] == '0');
14027 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14028 buf[sign+1] == 'o');
14029 numnondigits -= 2;
14030 buf += 2;
14031 len -= 2;
14032 if (sign)
14033 buf[0] = '-';
14034 assert(len == numnondigits + numdigits);
14035 assert(numdigits > 0);
14036 }
14037
14038 /* Fill with leading zeroes to meet minimum width. */
14039 if (prec > numdigits) {
14040 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14041 numnondigits + prec);
14042 char *b1;
14043 if (!r1) {
14044 Py_DECREF(result);
14045 return NULL;
14046 }
14047 b1 = PyBytes_AS_STRING(r1);
14048 for (i = 0; i < numnondigits; ++i)
14049 *b1++ = *buf++;
14050 for (i = 0; i < prec - numdigits; i++)
14051 *b1++ = '0';
14052 for (i = 0; i < numdigits; i++)
14053 *b1++ = *buf++;
14054 *b1 = '\0';
14055 Py_DECREF(result);
14056 result = r1;
14057 buf = PyBytes_AS_STRING(result);
14058 len = numnondigits + prec;
14059 }
14060
14061 /* Fix up case for hex conversions. */
14062 if (type == 'X') {
14063 /* Need to convert all lower case letters to upper case.
14064 and need to convert 0x to 0X (and -0x to -0X). */
14065 for (i = 0; i < len; i++)
14066 if (buf[i] >= 'a' && buf[i] <= 'x')
14067 buf[i] -= 'a'-'A';
14068 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014069 if (!PyUnicode_Check(result)
14070 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020014071 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014072 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014073 Py_DECREF(result);
14074 result = unicode;
14075 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014076 else if (len != PyUnicode_GET_LENGTH(result)) {
14077 if (PyUnicode_Resize(&result, len) < 0)
14078 Py_CLEAR(result);
14079 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014080 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000014081}
14082
Ethan Furmandf3ed242014-01-05 06:50:30 -080014083/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014084 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014085 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014086 * -1 and raise an exception on error */
14087static int
Victor Stinnera47082312012-10-04 02:19:54 +020014088mainformatlong(PyObject *v,
14089 struct unicode_format_arg_t *arg,
14090 PyObject **p_output,
14091 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014092{
14093 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014094 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014095
14096 if (!PyNumber_Check(v))
14097 goto wrongtype;
14098
Ethan Furman9ab74802014-03-21 06:38:46 -070014099 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014100 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014101 if (type == 'o' || type == 'x' || type == 'X') {
14102 iobj = PyNumber_Index(v);
14103 if (iobj == NULL) {
Ethan Furman9ab74802014-03-21 06:38:46 -070014104 if (PyErr_ExceptionMatches(PyExc_TypeError))
14105 goto wrongtype;
Ethan Furman38d872e2014-03-19 08:38:52 -070014106 return -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014107 }
14108 }
14109 else {
14110 iobj = PyNumber_Long(v);
14111 if (iobj == NULL ) {
14112 if (PyErr_ExceptionMatches(PyExc_TypeError))
14113 goto wrongtype;
14114 return -1;
14115 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014116 }
14117 assert(PyLong_Check(iobj));
14118 }
14119 else {
14120 iobj = v;
14121 Py_INCREF(iobj);
14122 }
14123
14124 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014125 && arg->width == -1 && arg->prec == -1
14126 && !(arg->flags & (F_SIGN | F_BLANK))
14127 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014128 {
14129 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014130 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014131 int base;
14132
Victor Stinnera47082312012-10-04 02:19:54 +020014133 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014134 {
14135 default:
14136 assert(0 && "'type' not in [diuoxX]");
14137 case 'd':
14138 case 'i':
14139 case 'u':
14140 base = 10;
14141 break;
14142 case 'o':
14143 base = 8;
14144 break;
14145 case 'x':
14146 case 'X':
14147 base = 16;
14148 break;
14149 }
14150
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014151 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14152 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014153 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014154 }
14155 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014156 return 1;
14157 }
14158
Ethan Furmanb95b5612015-01-23 20:05:18 -080014159 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014160 Py_DECREF(iobj);
14161 if (res == NULL)
14162 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014163 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014164 return 0;
14165
14166wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014167 switch(type)
14168 {
14169 case 'o':
14170 case 'x':
14171 case 'X':
14172 PyErr_Format(PyExc_TypeError,
14173 "%%%c format: an integer is required, "
14174 "not %.200s",
14175 type, Py_TYPE(v)->tp_name);
14176 break;
14177 default:
14178 PyErr_Format(PyExc_TypeError,
14179 "%%%c format: a number is required, "
14180 "not %.200s",
14181 type, Py_TYPE(v)->tp_name);
14182 break;
14183 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014184 return -1;
14185}
14186
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014187static Py_UCS4
14188formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014189{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014190 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014191 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014192 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014193 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014194 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014195 goto onError;
14196 }
14197 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014198 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014199 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014200 /* make sure number is a type of integer */
14201 if (!PyLong_Check(v)) {
14202 iobj = PyNumber_Index(v);
14203 if (iobj == NULL) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014204 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014205 }
14206 v = iobj;
14207 Py_DECREF(iobj);
14208 }
14209 /* Integer input truncated to a character */
Benjamin Peterson29060642009-01-31 22:14:21 +000014210 x = PyLong_AsLong(v);
14211 if (x == -1 && PyErr_Occurred())
14212 goto onError;
14213
Victor Stinner8faf8212011-12-08 22:14:11 +010014214 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014215 PyErr_SetString(PyExc_OverflowError,
14216 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014217 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014218 }
14219
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014220 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014221 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014222
Benjamin Peterson29060642009-01-31 22:14:21 +000014223 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014224 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014225 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014226 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014227}
14228
Victor Stinnera47082312012-10-04 02:19:54 +020014229/* Parse options of an argument: flags, width, precision.
14230 Handle also "%(name)" syntax.
14231
14232 Return 0 if the argument has been formatted into arg->str.
14233 Return 1 if the argument has been written into ctx->writer,
14234 Raise an exception and return -1 on error. */
14235static int
14236unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14237 struct unicode_format_arg_t *arg)
14238{
14239#define FORMAT_READ(ctx) \
14240 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14241
14242 PyObject *v;
14243
Victor Stinnera47082312012-10-04 02:19:54 +020014244 if (arg->ch == '(') {
14245 /* Get argument value from a dictionary. Example: "%(name)s". */
14246 Py_ssize_t keystart;
14247 Py_ssize_t keylen;
14248 PyObject *key;
14249 int pcount = 1;
14250
14251 if (ctx->dict == NULL) {
14252 PyErr_SetString(PyExc_TypeError,
14253 "format requires a mapping");
14254 return -1;
14255 }
14256 ++ctx->fmtpos;
14257 --ctx->fmtcnt;
14258 keystart = ctx->fmtpos;
14259 /* Skip over balanced parentheses */
14260 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14261 arg->ch = FORMAT_READ(ctx);
14262 if (arg->ch == ')')
14263 --pcount;
14264 else if (arg->ch == '(')
14265 ++pcount;
14266 ctx->fmtpos++;
14267 }
14268 keylen = ctx->fmtpos - keystart - 1;
14269 if (ctx->fmtcnt < 0 || pcount > 0) {
14270 PyErr_SetString(PyExc_ValueError,
14271 "incomplete format key");
14272 return -1;
14273 }
14274 key = PyUnicode_Substring(ctx->fmtstr,
14275 keystart, keystart + keylen);
14276 if (key == NULL)
14277 return -1;
14278 if (ctx->args_owned) {
Victor Stinnera47082312012-10-04 02:19:54 +020014279 ctx->args_owned = 0;
Serhiy Storchaka191321d2015-12-27 15:41:34 +020014280 Py_DECREF(ctx->args);
Victor Stinnera47082312012-10-04 02:19:54 +020014281 }
14282 ctx->args = PyObject_GetItem(ctx->dict, key);
14283 Py_DECREF(key);
14284 if (ctx->args == NULL)
14285 return -1;
14286 ctx->args_owned = 1;
14287 ctx->arglen = -1;
14288 ctx->argidx = -2;
14289 }
14290
14291 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014292 while (--ctx->fmtcnt >= 0) {
14293 arg->ch = FORMAT_READ(ctx);
14294 ctx->fmtpos++;
14295 switch (arg->ch) {
14296 case '-': arg->flags |= F_LJUST; continue;
14297 case '+': arg->flags |= F_SIGN; continue;
14298 case ' ': arg->flags |= F_BLANK; continue;
14299 case '#': arg->flags |= F_ALT; continue;
14300 case '0': arg->flags |= F_ZERO; continue;
14301 }
14302 break;
14303 }
14304
14305 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014306 if (arg->ch == '*') {
14307 v = unicode_format_getnextarg(ctx);
14308 if (v == NULL)
14309 return -1;
14310 if (!PyLong_Check(v)) {
14311 PyErr_SetString(PyExc_TypeError,
14312 "* wants int");
14313 return -1;
14314 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014315 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014316 if (arg->width == -1 && PyErr_Occurred())
14317 return -1;
14318 if (arg->width < 0) {
14319 arg->flags |= F_LJUST;
14320 arg->width = -arg->width;
14321 }
14322 if (--ctx->fmtcnt >= 0) {
14323 arg->ch = FORMAT_READ(ctx);
14324 ctx->fmtpos++;
14325 }
14326 }
14327 else if (arg->ch >= '0' && arg->ch <= '9') {
14328 arg->width = arg->ch - '0';
14329 while (--ctx->fmtcnt >= 0) {
14330 arg->ch = FORMAT_READ(ctx);
14331 ctx->fmtpos++;
14332 if (arg->ch < '0' || arg->ch > '9')
14333 break;
14334 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14335 mixing signed and unsigned comparison. Since arg->ch is between
14336 '0' and '9', casting to int is safe. */
14337 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14338 PyErr_SetString(PyExc_ValueError,
14339 "width too big");
14340 return -1;
14341 }
14342 arg->width = arg->width*10 + (arg->ch - '0');
14343 }
14344 }
14345
14346 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014347 if (arg->ch == '.') {
14348 arg->prec = 0;
14349 if (--ctx->fmtcnt >= 0) {
14350 arg->ch = FORMAT_READ(ctx);
14351 ctx->fmtpos++;
14352 }
14353 if (arg->ch == '*') {
14354 v = unicode_format_getnextarg(ctx);
14355 if (v == NULL)
14356 return -1;
14357 if (!PyLong_Check(v)) {
14358 PyErr_SetString(PyExc_TypeError,
14359 "* wants int");
14360 return -1;
14361 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014362 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014363 if (arg->prec == -1 && PyErr_Occurred())
14364 return -1;
14365 if (arg->prec < 0)
14366 arg->prec = 0;
14367 if (--ctx->fmtcnt >= 0) {
14368 arg->ch = FORMAT_READ(ctx);
14369 ctx->fmtpos++;
14370 }
14371 }
14372 else if (arg->ch >= '0' && arg->ch <= '9') {
14373 arg->prec = arg->ch - '0';
14374 while (--ctx->fmtcnt >= 0) {
14375 arg->ch = FORMAT_READ(ctx);
14376 ctx->fmtpos++;
14377 if (arg->ch < '0' || arg->ch > '9')
14378 break;
14379 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14380 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014381 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014382 return -1;
14383 }
14384 arg->prec = arg->prec*10 + (arg->ch - '0');
14385 }
14386 }
14387 }
14388
14389 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14390 if (ctx->fmtcnt >= 0) {
14391 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14392 if (--ctx->fmtcnt >= 0) {
14393 arg->ch = FORMAT_READ(ctx);
14394 ctx->fmtpos++;
14395 }
14396 }
14397 }
14398 if (ctx->fmtcnt < 0) {
14399 PyErr_SetString(PyExc_ValueError,
14400 "incomplete format");
14401 return -1;
14402 }
14403 return 0;
14404
14405#undef FORMAT_READ
14406}
14407
14408/* Format one argument. Supported conversion specifiers:
14409
14410 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014411 - "i", "d", "u": int or float
14412 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014413 - "e", "E", "f", "F", "g", "G": float
14414 - "c": int or str (1 character)
14415
Victor Stinner8dbd4212012-12-04 09:30:24 +010014416 When possible, the output is written directly into the Unicode writer
14417 (ctx->writer). A string is created when padding is required.
14418
Victor Stinnera47082312012-10-04 02:19:54 +020014419 Return 0 if the argument has been formatted into *p_str,
14420 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014421 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014422static int
14423unicode_format_arg_format(struct unicode_formatter_t *ctx,
14424 struct unicode_format_arg_t *arg,
14425 PyObject **p_str)
14426{
14427 PyObject *v;
14428 _PyUnicodeWriter *writer = &ctx->writer;
14429
14430 if (ctx->fmtcnt == 0)
14431 ctx->writer.overallocate = 0;
14432
14433 if (arg->ch == '%') {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014434 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014435 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014436 return 1;
14437 }
14438
14439 v = unicode_format_getnextarg(ctx);
14440 if (v == NULL)
14441 return -1;
14442
Victor Stinnera47082312012-10-04 02:19:54 +020014443
14444 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014445 case 's':
14446 case 'r':
14447 case 'a':
14448 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14449 /* Fast path */
14450 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14451 return -1;
14452 return 1;
14453 }
14454
14455 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14456 *p_str = v;
14457 Py_INCREF(*p_str);
14458 }
14459 else {
14460 if (arg->ch == 's')
14461 *p_str = PyObject_Str(v);
14462 else if (arg->ch == 'r')
14463 *p_str = PyObject_Repr(v);
14464 else
14465 *p_str = PyObject_ASCII(v);
14466 }
14467 break;
14468
14469 case 'i':
14470 case 'd':
14471 case 'u':
14472 case 'o':
14473 case 'x':
14474 case 'X':
14475 {
14476 int ret = mainformatlong(v, arg, p_str, writer);
14477 if (ret != 0)
14478 return ret;
14479 arg->sign = 1;
14480 break;
14481 }
14482
14483 case 'e':
14484 case 'E':
14485 case 'f':
14486 case 'F':
14487 case 'g':
14488 case 'G':
14489 if (arg->width == -1 && arg->prec == -1
14490 && !(arg->flags & (F_SIGN | F_BLANK)))
14491 {
14492 /* Fast path */
14493 if (formatfloat(v, arg, NULL, writer) == -1)
14494 return -1;
14495 return 1;
14496 }
14497
14498 arg->sign = 1;
14499 if (formatfloat(v, arg, p_str, NULL) == -1)
14500 return -1;
14501 break;
14502
14503 case 'c':
14504 {
14505 Py_UCS4 ch = formatchar(v);
14506 if (ch == (Py_UCS4) -1)
14507 return -1;
14508 if (arg->width == -1 && arg->prec == -1) {
14509 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014510 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014511 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014512 return 1;
14513 }
14514 *p_str = PyUnicode_FromOrdinal(ch);
14515 break;
14516 }
14517
14518 default:
14519 PyErr_Format(PyExc_ValueError,
14520 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020014521 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020014522 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14523 (int)arg->ch,
14524 ctx->fmtpos - 1);
14525 return -1;
14526 }
14527 if (*p_str == NULL)
14528 return -1;
14529 assert (PyUnicode_Check(*p_str));
14530 return 0;
14531}
14532
14533static int
14534unicode_format_arg_output(struct unicode_formatter_t *ctx,
14535 struct unicode_format_arg_t *arg,
14536 PyObject *str)
14537{
14538 Py_ssize_t len;
14539 enum PyUnicode_Kind kind;
14540 void *pbuf;
14541 Py_ssize_t pindex;
14542 Py_UCS4 signchar;
14543 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014544 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014545 Py_ssize_t sublen;
14546 _PyUnicodeWriter *writer = &ctx->writer;
14547 Py_UCS4 fill;
14548
14549 fill = ' ';
14550 if (arg->sign && arg->flags & F_ZERO)
14551 fill = '0';
14552
14553 if (PyUnicode_READY(str) == -1)
14554 return -1;
14555
14556 len = PyUnicode_GET_LENGTH(str);
14557 if ((arg->width == -1 || arg->width <= len)
14558 && (arg->prec == -1 || arg->prec >= len)
14559 && !(arg->flags & (F_SIGN | F_BLANK)))
14560 {
14561 /* Fast path */
14562 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
14563 return -1;
14564 return 0;
14565 }
14566
14567 /* Truncate the string for "s", "r" and "a" formats
14568 if the precision is set */
14569 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
14570 if (arg->prec >= 0 && len > arg->prec)
14571 len = arg->prec;
14572 }
14573
14574 /* Adjust sign and width */
14575 kind = PyUnicode_KIND(str);
14576 pbuf = PyUnicode_DATA(str);
14577 pindex = 0;
14578 signchar = '\0';
14579 if (arg->sign) {
14580 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14581 if (ch == '-' || ch == '+') {
14582 signchar = ch;
14583 len--;
14584 pindex++;
14585 }
14586 else if (arg->flags & F_SIGN)
14587 signchar = '+';
14588 else if (arg->flags & F_BLANK)
14589 signchar = ' ';
14590 else
14591 arg->sign = 0;
14592 }
14593 if (arg->width < len)
14594 arg->width = len;
14595
14596 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014597 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014598 if (!(arg->flags & F_LJUST)) {
14599 if (arg->sign) {
14600 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014601 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014602 }
14603 else {
14604 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014605 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014606 }
14607 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014608 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
14609 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014610 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014611 }
14612
Victor Stinnera47082312012-10-04 02:19:54 +020014613 buflen = arg->width;
14614 if (arg->sign && len == arg->width)
14615 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014616 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020014617 return -1;
14618
14619 /* Write the sign if needed */
14620 if (arg->sign) {
14621 if (fill != ' ') {
14622 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14623 writer->pos += 1;
14624 }
14625 if (arg->width > len)
14626 arg->width--;
14627 }
14628
14629 /* Write the numeric prefix for "x", "X" and "o" formats
14630 if the alternate form is used.
14631 For example, write "0x" for the "%#x" format. */
14632 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14633 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14634 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14635 if (fill != ' ') {
14636 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14637 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14638 writer->pos += 2;
14639 pindex += 2;
14640 }
14641 arg->width -= 2;
14642 if (arg->width < 0)
14643 arg->width = 0;
14644 len -= 2;
14645 }
14646
14647 /* Pad left with the fill character if needed */
14648 if (arg->width > len && !(arg->flags & F_LJUST)) {
14649 sublen = arg->width - len;
14650 FILL(writer->kind, writer->data, fill, writer->pos, sublen);
14651 writer->pos += sublen;
14652 arg->width = len;
14653 }
14654
14655 /* If padding with spaces: write sign if needed and/or numeric prefix if
14656 the alternate form is used */
14657 if (fill == ' ') {
14658 if (arg->sign) {
14659 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14660 writer->pos += 1;
14661 }
14662 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14663 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14664 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
14665 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14666 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14667 writer->pos += 2;
14668 pindex += 2;
14669 }
14670 }
14671
14672 /* Write characters */
14673 if (len) {
14674 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14675 str, pindex, len);
14676 writer->pos += len;
14677 }
14678
14679 /* Pad right with the fill character if needed */
14680 if (arg->width > len) {
14681 sublen = arg->width - len;
14682 FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
14683 writer->pos += sublen;
14684 }
14685 return 0;
14686}
14687
14688/* Helper of PyUnicode_Format(): format one arg.
14689 Return 0 on success, raise an exception and return -1 on error. */
14690static int
14691unicode_format_arg(struct unicode_formatter_t *ctx)
14692{
14693 struct unicode_format_arg_t arg;
14694 PyObject *str;
14695 int ret;
14696
Victor Stinner8dbd4212012-12-04 09:30:24 +010014697 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
14698 arg.flags = 0;
14699 arg.width = -1;
14700 arg.prec = -1;
14701 arg.sign = 0;
14702 str = NULL;
14703
Victor Stinnera47082312012-10-04 02:19:54 +020014704 ret = unicode_format_arg_parse(ctx, &arg);
14705 if (ret == -1)
14706 return -1;
14707
14708 ret = unicode_format_arg_format(ctx, &arg, &str);
14709 if (ret == -1)
14710 return -1;
14711
14712 if (ret != 1) {
14713 ret = unicode_format_arg_output(ctx, &arg, str);
14714 Py_DECREF(str);
14715 if (ret == -1)
14716 return -1;
14717 }
14718
14719 if (ctx->dict && (ctx->argidx < ctx->arglen) && arg.ch != '%') {
14720 PyErr_SetString(PyExc_TypeError,
14721 "not all arguments converted during string formatting");
14722 return -1;
14723 }
14724 return 0;
14725}
14726
Alexander Belopolsky40018472011-02-26 01:02:56 +000014727PyObject *
14728PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014729{
Victor Stinnera47082312012-10-04 02:19:54 +020014730 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000014731
Guido van Rossumd57fd912000-03-10 22:53:23 +000014732 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014733 PyErr_BadInternalCall();
14734 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014735 }
Victor Stinnera47082312012-10-04 02:19:54 +020014736
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014737 if (ensure_unicode(format) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014738 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014739
14740 ctx.fmtstr = format;
Victor Stinnera47082312012-10-04 02:19:54 +020014741 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14742 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14743 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14744 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014745
Victor Stinner8f674cc2013-04-17 23:02:17 +020014746 _PyUnicodeWriter_Init(&ctx.writer);
14747 ctx.writer.min_length = ctx.fmtcnt + 100;
14748 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014749
Guido van Rossumd57fd912000-03-10 22:53:23 +000014750 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014751 ctx.arglen = PyTuple_Size(args);
14752 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014753 }
14754 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014755 ctx.arglen = -1;
14756 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014757 }
Victor Stinnera47082312012-10-04 02:19:54 +020014758 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040014759 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020014760 ctx.dict = args;
14761 else
14762 ctx.dict = NULL;
14763 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014764
Victor Stinnera47082312012-10-04 02:19:54 +020014765 while (--ctx.fmtcnt >= 0) {
14766 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020014767 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020014768
14769 nonfmtpos = ctx.fmtpos++;
14770 while (ctx.fmtcnt >= 0 &&
14771 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14772 ctx.fmtpos++;
14773 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014774 }
Victor Stinnera47082312012-10-04 02:19:54 +020014775 if (ctx.fmtcnt < 0) {
14776 ctx.fmtpos--;
14777 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020014778 }
Victor Stinneree4544c2012-05-09 22:24:08 +020014779
Victor Stinnercfc4c132013-04-03 01:48:39 +020014780 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14781 nonfmtpos, ctx.fmtpos) < 0)
14782 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014783 }
14784 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014785 ctx.fmtpos++;
14786 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000014787 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020014788 }
14789 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020014790
Victor Stinnera47082312012-10-04 02:19:54 +020014791 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014792 PyErr_SetString(PyExc_TypeError,
14793 "not all arguments converted during string formatting");
14794 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014795 }
14796
Victor Stinnera47082312012-10-04 02:19:54 +020014797 if (ctx.args_owned) {
14798 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014799 }
Victor Stinnera47082312012-10-04 02:19:54 +020014800 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014801
Benjamin Peterson29060642009-01-31 22:14:21 +000014802 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020014803 _PyUnicodeWriter_Dealloc(&ctx.writer);
14804 if (ctx.args_owned) {
14805 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014806 }
14807 return NULL;
14808}
14809
Jeremy Hylton938ace62002-07-17 16:30:39 +000014810static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000014811unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14812
Tim Peters6d6c1a32001-08-02 04:15:00 +000014813static PyObject *
14814unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14815{
Benjamin Peterson29060642009-01-31 22:14:21 +000014816 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014817 static char *kwlist[] = {"object", "encoding", "errors", 0};
14818 char *encoding = NULL;
14819 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000014820
Benjamin Peterson14339b62009-01-31 16:36:08 +000014821 if (type != &PyUnicode_Type)
14822 return unicode_subtype_new(type, args, kwds);
14823 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000014824 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000014825 return NULL;
14826 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020014827 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000014828 if (encoding == NULL && errors == NULL)
14829 return PyObject_Str(x);
14830 else
Benjamin Peterson29060642009-01-31 22:14:21 +000014831 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000014832}
14833
Guido van Rossume023fe02001-08-30 03:12:59 +000014834static PyObject *
14835unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14836{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014837 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014838 Py_ssize_t length, char_size;
14839 int share_wstr, share_utf8;
14840 unsigned int kind;
14841 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000014842
Benjamin Peterson14339b62009-01-31 16:36:08 +000014843 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014844
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014845 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014846 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014847 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014848 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050014849 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060014850 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014851 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060014852 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014853
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014854 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014855 if (self == NULL) {
14856 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014857 return NULL;
14858 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014859 kind = PyUnicode_KIND(unicode);
14860 length = PyUnicode_GET_LENGTH(unicode);
14861
14862 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014863#ifdef Py_DEBUG
14864 _PyUnicode_HASH(self) = -1;
14865#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014866 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014867#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014868 _PyUnicode_STATE(self).interned = 0;
14869 _PyUnicode_STATE(self).kind = kind;
14870 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020014871 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014872 _PyUnicode_STATE(self).ready = 1;
14873 _PyUnicode_WSTR(self) = NULL;
14874 _PyUnicode_UTF8_LENGTH(self) = 0;
14875 _PyUnicode_UTF8(self) = NULL;
14876 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020014877 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014878
14879 share_utf8 = 0;
14880 share_wstr = 0;
14881 if (kind == PyUnicode_1BYTE_KIND) {
14882 char_size = 1;
14883 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
14884 share_utf8 = 1;
14885 }
14886 else if (kind == PyUnicode_2BYTE_KIND) {
14887 char_size = 2;
14888 if (sizeof(wchar_t) == 2)
14889 share_wstr = 1;
14890 }
14891 else {
14892 assert(kind == PyUnicode_4BYTE_KIND);
14893 char_size = 4;
14894 if (sizeof(wchar_t) == 4)
14895 share_wstr = 1;
14896 }
14897
14898 /* Ensure we won't overflow the length. */
14899 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
14900 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014901 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014902 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014903 data = PyObject_MALLOC((length + 1) * char_size);
14904 if (data == NULL) {
14905 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014906 goto onError;
14907 }
14908
Victor Stinnerc3c74152011-10-02 20:39:55 +020014909 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014910 if (share_utf8) {
14911 _PyUnicode_UTF8_LENGTH(self) = length;
14912 _PyUnicode_UTF8(self) = data;
14913 }
14914 if (share_wstr) {
14915 _PyUnicode_WSTR_LENGTH(self) = length;
14916 _PyUnicode_WSTR(self) = (wchar_t *)data;
14917 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014918
Christian Heimesf051e432016-09-13 20:22:02 +020014919 memcpy(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020014920 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014921 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014922#ifdef Py_DEBUG
14923 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14924#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020014925 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010014926 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014927
14928onError:
14929 Py_DECREF(unicode);
14930 Py_DECREF(self);
14931 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000014932}
14933
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000014934PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070014935"str(object='') -> str\n\
14936str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000014937\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100014938Create a new string object from the given object. If encoding or\n\
14939errors is specified, then the object must expose a data buffer\n\
14940that will be decoded using the given encoding and error handler.\n\
14941Otherwise, returns the result of object.__str__() (if defined)\n\
14942or repr(object).\n\
14943encoding defaults to sys.getdefaultencoding().\n\
14944errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000014945
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014946static PyObject *unicode_iter(PyObject *seq);
14947
Guido van Rossumd57fd912000-03-10 22:53:23 +000014948PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000014949 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014950 "str", /* tp_name */
14951 sizeof(PyUnicodeObject), /* tp_size */
14952 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014953 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014954 (destructor)unicode_dealloc, /* tp_dealloc */
14955 0, /* tp_print */
14956 0, /* tp_getattr */
14957 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014958 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014959 unicode_repr, /* tp_repr */
14960 &unicode_as_number, /* tp_as_number */
14961 &unicode_as_sequence, /* tp_as_sequence */
14962 &unicode_as_mapping, /* tp_as_mapping */
14963 (hashfunc) unicode_hash, /* tp_hash*/
14964 0, /* tp_call*/
14965 (reprfunc) unicode_str, /* tp_str */
14966 PyObject_GenericGetAttr, /* tp_getattro */
14967 0, /* tp_setattro */
14968 0, /* tp_as_buffer */
14969 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000014970 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014971 unicode_doc, /* tp_doc */
14972 0, /* tp_traverse */
14973 0, /* tp_clear */
14974 PyUnicode_RichCompare, /* tp_richcompare */
14975 0, /* tp_weaklistoffset */
14976 unicode_iter, /* tp_iter */
14977 0, /* tp_iternext */
14978 unicode_methods, /* tp_methods */
14979 0, /* tp_members */
14980 0, /* tp_getset */
14981 &PyBaseObject_Type, /* tp_base */
14982 0, /* tp_dict */
14983 0, /* tp_descr_get */
14984 0, /* tp_descr_set */
14985 0, /* tp_dictoffset */
14986 0, /* tp_init */
14987 0, /* tp_alloc */
14988 unicode_new, /* tp_new */
14989 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014990};
14991
14992/* Initialize the Unicode implementation */
14993
Victor Stinner3a50e702011-10-18 21:21:00 +020014994int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014995{
Thomas Wouters477c8d52006-05-27 19:21:47 +000014996 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014997 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000014998 0x000A, /* LINE FEED */
14999 0x000D, /* CARRIAGE RETURN */
15000 0x001C, /* FILE SEPARATOR */
15001 0x001D, /* GROUP SEPARATOR */
15002 0x001E, /* RECORD SEPARATOR */
15003 0x0085, /* NEXT LINE */
15004 0x2028, /* LINE SEPARATOR */
15005 0x2029, /* PARAGRAPH SEPARATOR */
15006 };
15007
Fred Drakee4315f52000-05-09 19:53:39 +000015008 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020015009 _Py_INCREF_UNICODE_EMPTY();
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015010 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015011 Py_FatalError("Can't create empty string");
Serhiy Storchaka678db842013-01-26 12:16:36 +020015012 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015013
Guido van Rossumcacfc072002-05-24 19:01:59 +000015014 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000015015 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000015016
15017 /* initialize the linebreak bloom filter */
15018 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015019 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020015020 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015021
Christian Heimes26532f72013-07-20 14:57:16 +020015022 if (PyType_Ready(&EncodingMapType) < 0)
15023 Py_FatalError("Can't initialize encoding map type");
Victor Stinner3a50e702011-10-18 21:21:00 +020015024
Benjamin Petersonc4311282012-10-30 23:21:10 -040015025 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
15026 Py_FatalError("Can't initialize field name iterator type");
15027
15028 if (PyType_Ready(&PyFormatterIter_Type) < 0)
15029 Py_FatalError("Can't initialize formatter iter type");
Benjamin Petersone8ea97f2012-10-30 23:27:52 -040015030
Victor Stinner3a50e702011-10-18 21:21:00 +020015031 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015032}
15033
15034/* Finalize the Unicode implementation */
15035
int
PyUnicode_ClearFreeList(void)
{
    /* Nothing is released here: the function always returns 0 */
    return 0;
}
15041
Guido van Rossumd57fd912000-03-10 22:53:23 +000015042void
Thomas Wouters78890102000-07-22 19:25:51 +000015043_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015044{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000015045 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015046
Serhiy Storchaka05997252013-01-26 12:14:02 +020015047 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +000015048
Serhiy Storchaka05997252013-01-26 12:14:02 +020015049 for (i = 0; i < 256; i++)
15050 Py_CLEAR(unicode_latin1[i]);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020015051 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000015052 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015053}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015054
Walter Dörwald16807132007-05-25 13:52:07 +000015055void
15056PyUnicode_InternInPlace(PyObject **p)
15057{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015058 PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015059 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015060#ifdef Py_DEBUG
15061 assert(s != NULL);
15062 assert(_PyUnicode_CHECK(s));
15063#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000015064 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020015065 return;
15066#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015067 /* If it's a subclass, we don't really know what putting
15068 it in the interned dict might do. */
15069 if (!PyUnicode_CheckExact(s))
15070 return;
15071 if (PyUnicode_CHECK_INTERNED(s))
15072 return;
15073 if (interned == NULL) {
15074 interned = PyDict_New();
15075 if (interned == NULL) {
15076 PyErr_Clear(); /* Don't leave an exception */
15077 return;
15078 }
15079 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015080 Py_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015081 t = PyDict_SetDefault(interned, s, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015082 Py_END_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015083 if (t == NULL) {
15084 PyErr_Clear();
15085 return;
15086 }
15087 if (t != s) {
Victor Stinnerf0335102013-04-14 19:13:03 +020015088 Py_INCREF(t);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030015089 Py_SETREF(*p, t);
Victor Stinnerf0335102013-04-14 19:13:03 +020015090 return;
15091 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015092 /* The two references in interned are not counted by refcnt.
15093 The deallocator will take care of this */
15094 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015095 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000015096}
15097
15098void
15099PyUnicode_InternImmortal(PyObject **p)
15100{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015101 PyUnicode_InternInPlace(p);
15102 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015103 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015104 Py_INCREF(*p);
15105 }
Walter Dörwald16807132007-05-25 13:52:07 +000015106}
15107
15108PyObject *
15109PyUnicode_InternFromString(const char *cp)
15110{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015111 PyObject *s = PyUnicode_FromString(cp);
15112 if (s == NULL)
15113 return NULL;
15114 PyUnicode_InternInPlace(&s);
15115 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015116}
15117
Alexander Belopolsky40018472011-02-26 01:02:56 +000015118void
15119_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000015120{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015121 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015122 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015123 Py_ssize_t i, n;
15124 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000015125
Benjamin Peterson14339b62009-01-31 16:36:08 +000015126 if (interned == NULL || !PyDict_Check(interned))
15127 return;
15128 keys = PyDict_Keys(interned);
15129 if (keys == NULL || !PyList_Check(keys)) {
15130 PyErr_Clear();
15131 return;
15132 }
Walter Dörwald16807132007-05-25 13:52:07 +000015133
Benjamin Peterson14339b62009-01-31 16:36:08 +000015134 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
15135 detector, interned unicode strings are not forcibly deallocated;
15136 rather, we give them their stolen references back, and then clear
15137 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000015138
Benjamin Peterson14339b62009-01-31 16:36:08 +000015139 n = PyList_GET_SIZE(keys);
15140 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000015141 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015142 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015143 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015144 if (PyUnicode_READY(s) == -1) {
15145 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015146 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015147 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015148 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015149 case SSTATE_NOT_INTERNED:
15150 /* XXX Shouldn't happen */
15151 break;
15152 case SSTATE_INTERNED_IMMORTAL:
15153 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015154 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015155 break;
15156 case SSTATE_INTERNED_MORTAL:
15157 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015158 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015159 break;
15160 default:
15161 Py_FatalError("Inconsistent interned string state.");
15162 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015163 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015164 }
15165 fprintf(stderr, "total size of all interned strings: "
15166 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
15167 "mortal/immortal\n", mortal_size, immortal_size);
15168 Py_DECREF(keys);
15169 PyDict_Clear(interned);
Serhiy Storchaka05997252013-01-26 12:14:02 +020015170 Py_CLEAR(interned);
Walter Dörwald16807132007-05-25 13:52:07 +000015171}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015172
15173
15174/********************* Unicode Iterator **************************/
15175
15176typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015177 PyObject_HEAD
15178 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015179 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015180} unicodeiterobject;
15181
15182static void
15183unicodeiter_dealloc(unicodeiterobject *it)
15184{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015185 _PyObject_GC_UNTRACK(it);
15186 Py_XDECREF(it->it_seq);
15187 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015188}
15189
15190static int
15191unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15192{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015193 Py_VISIT(it->it_seq);
15194 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015195}
15196
15197static PyObject *
15198unicodeiter_next(unicodeiterobject *it)
15199{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015200 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015201
Benjamin Peterson14339b62009-01-31 16:36:08 +000015202 assert(it != NULL);
15203 seq = it->it_seq;
15204 if (seq == NULL)
15205 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015206 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015207
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015208 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15209 int kind = PyUnicode_KIND(seq);
15210 void *data = PyUnicode_DATA(seq);
15211 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15212 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015213 if (item != NULL)
15214 ++it->it_index;
15215 return item;
15216 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015217
Benjamin Peterson14339b62009-01-31 16:36:08 +000015218 it->it_seq = NULL;
Serhiy Storchakafbb1c5e2016-03-30 20:40:02 +030015219 Py_DECREF(seq);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015220 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015221}
15222
15223static PyObject *
15224unicodeiter_len(unicodeiterobject *it)
15225{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015226 Py_ssize_t len = 0;
15227 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015228 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015229 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015230}
15231
15232PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15233
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015234static PyObject *
15235unicodeiter_reduce(unicodeiterobject *it)
15236{
15237 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020015238 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015239 it->it_seq, it->it_index);
15240 } else {
15241 PyObject *u = PyUnicode_FromUnicode(NULL, 0);
15242 if (u == NULL)
15243 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020015244 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015245 }
15246}
15247
15248PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15249
15250static PyObject *
15251unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15252{
15253 Py_ssize_t index = PyLong_AsSsize_t(state);
15254 if (index == -1 && PyErr_Occurred())
15255 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015256 if (it->it_seq != NULL) {
15257 if (index < 0)
15258 index = 0;
15259 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15260 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15261 it->it_index = index;
15262 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015263 Py_RETURN_NONE;
15264}
15265
15266PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15267
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015268static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015269 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015270 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015271 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15272 reduce_doc},
15273 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15274 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015275 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015276};
15277
15278PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015279 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15280 "str_iterator", /* tp_name */
15281 sizeof(unicodeiterobject), /* tp_basicsize */
15282 0, /* tp_itemsize */
15283 /* methods */
15284 (destructor)unicodeiter_dealloc, /* tp_dealloc */
15285 0, /* tp_print */
15286 0, /* tp_getattr */
15287 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000015288 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015289 0, /* tp_repr */
15290 0, /* tp_as_number */
15291 0, /* tp_as_sequence */
15292 0, /* tp_as_mapping */
15293 0, /* tp_hash */
15294 0, /* tp_call */
15295 0, /* tp_str */
15296 PyObject_GenericGetAttr, /* tp_getattro */
15297 0, /* tp_setattro */
15298 0, /* tp_as_buffer */
15299 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15300 0, /* tp_doc */
15301 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15302 0, /* tp_clear */
15303 0, /* tp_richcompare */
15304 0, /* tp_weaklistoffset */
15305 PyObject_SelfIter, /* tp_iter */
15306 (iternextfunc)unicodeiter_next, /* tp_iternext */
15307 unicodeiter_methods, /* tp_methods */
15308 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015309};
15310
15311static PyObject *
15312unicode_iter(PyObject *seq)
15313{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015314 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015315
Benjamin Peterson14339b62009-01-31 16:36:08 +000015316 if (!PyUnicode_Check(seq)) {
15317 PyErr_BadInternalCall();
15318 return NULL;
15319 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015320 if (PyUnicode_READY(seq) == -1)
15321 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015322 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15323 if (it == NULL)
15324 return NULL;
15325 it->it_index = 0;
15326 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015327 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015328 _PyObject_GC_TRACK(it);
15329 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015330}
15331
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015332
15333size_t
15334Py_UNICODE_strlen(const Py_UNICODE *u)
15335{
15336 int res = 0;
15337 while(*u++)
15338 res++;
15339 return res;
15340}
15341
15342Py_UNICODE*
15343Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15344{
15345 Py_UNICODE *u = s1;
15346 while ((*u++ = *s2++));
15347 return s1;
15348}
15349
15350Py_UNICODE*
15351Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15352{
15353 Py_UNICODE *u = s1;
15354 while ((*u++ = *s2++))
15355 if (n-- == 0)
15356 break;
15357 return s1;
15358}
15359
15360Py_UNICODE*
15361Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15362{
15363 Py_UNICODE *u1 = s1;
15364 u1 += Py_UNICODE_strlen(u1);
15365 Py_UNICODE_strcpy(u1, s2);
15366 return s1;
15367}
15368
15369int
15370Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15371{
15372 while (*s1 && *s2 && *s1 == *s2)
15373 s1++, s2++;
15374 if (*s1 && *s2)
15375 return (*s1 < *s2) ? -1 : +1;
15376 if (*s1)
15377 return 1;
15378 if (*s2)
15379 return -1;
15380 return 0;
15381}
15382
15383int
15384Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15385{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015386 Py_UNICODE u1, u2;
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015387 for (; n != 0; n--) {
15388 u1 = *s1;
15389 u2 = *s2;
15390 if (u1 != u2)
15391 return (u1 < u2) ? -1 : +1;
15392 if (u1 == '\0')
15393 return 0;
15394 s1++;
15395 s2++;
15396 }
15397 return 0;
15398}
15399
15400Py_UNICODE*
15401Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15402{
15403 const Py_UNICODE *p;
15404 for (p = s; *p; p++)
15405 if (*p == c)
15406 return (Py_UNICODE*)p;
15407 return NULL;
15408}
15409
15410Py_UNICODE*
15411Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15412{
15413 const Py_UNICODE *p;
15414 p = s + Py_UNICODE_strlen(s);
15415 while (p != s) {
15416 p--;
15417 if (*p == c)
15418 return (Py_UNICODE*)p;
15419 }
15420 return NULL;
15421}
Victor Stinner331ea922010-08-10 16:37:20 +000015422
Victor Stinner71133ff2010-09-01 23:43:53 +000015423Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015424PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015425{
Victor Stinner577db2c2011-10-11 22:12:48 +020015426 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015427 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015428
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015429 if (!PyUnicode_Check(unicode)) {
15430 PyErr_BadArgument();
15431 return NULL;
15432 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015433 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015434 if (u == NULL)
15435 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015436 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -070015437 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015438 PyErr_NoMemory();
15439 return NULL;
15440 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015441 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015442 size *= sizeof(Py_UNICODE);
15443 copy = PyMem_Malloc(size);
15444 if (copy == NULL) {
15445 PyErr_NoMemory();
15446 return NULL;
15447 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015448 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015449 return copy;
15450}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015451
Georg Brandl66c221e2010-10-14 07:04:07 +000015452/* A _string module, to export formatter_parser and formatter_field_name_split
15453 to the string.Formatter class implemented in Python. */
15454
15455static PyMethodDef _string_methods[] = {
15456 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15457 METH_O, PyDoc_STR("split the argument as a field name")},
15458 {"formatter_parser", (PyCFunction) formatter_parser,
15459 METH_O, PyDoc_STR("parse the argument as a format string")},
15460 {NULL, NULL}
15461};
15462
15463static struct PyModuleDef _string_module = {
15464 PyModuleDef_HEAD_INIT,
15465 "_string",
15466 PyDoc_STR("string helper module"),
15467 0,
15468 _string_methods,
15469 NULL,
15470 NULL,
15471 NULL,
15472 NULL
15473};
15474
15475PyMODINIT_FUNC
15476PyInit__string(void)
15477{
15478 return PyModule_Create(&_string_module);
15479}
15480
15481
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000015482#ifdef __cplusplus
15483}
15484#endif