blob: 7984454856f8a53d204a9f1e774fd350fbd51cb0 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Raymond Hettingerac2ef652015-07-04 16:04:44 -070045#include "stringlib/eq.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000046
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000047#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000048#include <windows.h>
49#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000050
Larry Hastings61272b72014-01-07 12:41:53 -080051/*[clinic input]
Larry Hastingsc2047262014-01-25 20:43:29 -080052class str "PyUnicodeObject *" "&PyUnicode_Type"
Larry Hastings61272b72014-01-07 12:41:53 -080053[clinic start generated code]*/
Larry Hastings581ee362014-01-28 05:00:08 -080054/*[clinic end generated code: output=da39a3ee5e6b4b0d input=604e916854800fa8]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080055
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000056/* --- Globals ------------------------------------------------------------
57
Serhiy Storchaka05997252013-01-26 12:14:02 +020058NOTE: In the interpreter's initialization phase, some globals are currently
59 initialized dynamically as needed. In the process Unicode objects may
60 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000061
62*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000063
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000064
65#ifdef __cplusplus
66extern "C" {
67#endif
68
/* Highest code point defined by Unicode 6.0: U+10FFFF (1,114,111). */
#define MAX_UNICODE 0x10ffff

/* Object check used by the accessors below: the consistency checker in
   debug builds, a plain type check otherwise. */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif

/* UTF-8 buffer.  The underscore variant reads the struct field directly;
   the checked variant substitutes the memory that follows the PyASCIIObject
   header when the string is compact ASCII. */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))

/* UTF-8 length.  For compact ASCII strings the checked variant returns the
   character length instead of the utf8_length field. */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))

/* Unchecked direct field accessors. */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)

/* Field accessors that first assert _PyUnicode_CHECK(op). */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)

/* Unchecked access to the data pointer of a PyUnicodeObject. */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)

/* File-local replacement for PyUnicode_READY(): evaluates to 0 when the
   string is already ready, otherwise to the result of _PyUnicode_Ready(). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200119
/* True when the UTF-8 pointer is the same address as the string's character
   data.  Must not be used on compact ASCII strings (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True when the wstr pointer is the same address as the string's character
   data.  Every access goes through the macro argument, so the macro works
   regardless of what the caller's variable is called. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
127
/* Non-zero when the object has a UTF-8 buffer of its own: the string is not
   compact ASCII, the utf8 pointer is set, and it is not simply the
   character data. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    ((!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op) != NULL                    \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* Non-zero when the object has a wstr buffer of its own: the wstr pointer
   is set and, for a ready string, it is not simply the character data.
   PyUnicode_DATA() is only evaluated when the string is ready. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    ((_PyUnicode_WSTR(op) != NULL                       \
      && !(PyUnicode_IS_READY(op)                       \
           && _PyUnicode_WSTR(op) == PyUnicode_DATA(op))))
141
/* Generic helper that copies a run of characters while converting the
   element type.

   from_type, to_type: valid type names of the source and target elements.
   begin, end:         pointers of type "from_type *" delimiting the source
                       characters (end is exclusive).
   to:                 pointer of type "to_type *" to the buffer receiving
                       the converted characters.

   The bulk of the input is processed four elements per iteration; the
   remaining (at most three) elements are copied one at a time. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (from_type *)(begin);  \
        const from_type *_end = (from_type *)(end);     \
        Py_ssize_t n = (_end) - (_iter);                \
        const from_type *_unrolled_end =                \
            _iter + _Py_SIZE_ROUND_DOWN(n, 4);          \
        for (; _iter < (_unrolled_end); _iter += 4, _to += 4) { \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
        }                                               \
        for (; _iter < (_end); ++_iter, ++_to)          \
            *_to = (to_type) *_iter;                    \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200165
/* Platform-specific overallocation factor.  Judging by the percentages
   quoted below, the value acts as a divisor (2 -> 50%, 4 -> 25%). */
#ifdef MS_WINDOWS
   /* On Windows, overallocating by 50% is the best factor */
#  define OVERALLOCATE_FACTOR 2
#else
   /* On Linux, overallocating by 25% is the best factor */
#  define OVERALLOCATE_FACTOR 4
#endif
173
Walter Dörwald16807132007-05-25 13:52:07 +0000174/* This dictionary holds all interned unicode strings. Note that references
175 to strings in this dictionary are *not* counted in the string's ob_refcnt.
176 When the interned string reaches a refcnt of 0 the string deallocation
177 function will delete the reference from this dictionary.
178
179 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000180 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000181*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200182static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000183
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000184/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200185static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200186
Serhiy Storchaka678db842013-01-26 12:16:36 +0200187#define _Py_INCREF_UNICODE_EMPTY() \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200188 do { \
189 if (unicode_empty != NULL) \
190 Py_INCREF(unicode_empty); \
191 else { \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200192 unicode_empty = PyUnicode_New(0, 0); \
193 if (unicode_empty != NULL) { \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200194 Py_INCREF(unicode_empty); \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200195 assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
196 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200197 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200198 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000199
Serhiy Storchaka678db842013-01-26 12:16:36 +0200200#define _Py_RETURN_UNICODE_EMPTY() \
201 do { \
202 _Py_INCREF_UNICODE_EMPTY(); \
203 return unicode_empty; \
204 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000205
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200206/* Forward declaration */
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700207static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200208_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
209
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200210/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200211static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200212
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000213/* Single character Unicode strings in the Latin-1 range are being
214 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200215static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000216
/* Fast detection of the most frequent whitespace characters: a 128-entry
   table indexed by ASCII code point, 1 for whitespace and 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F
       0x0009: CHARACTER TABULATION
       0x000A: LINE FEED
       0x000B: LINE TABULATION
       0x000C: FORM FEED
       0x000D: CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F
       0x001C: FILE SEPARATOR
       0x001D: GROUP SEPARATOR
       0x001E: RECORD SEPARATOR
       0x001F: UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27
       0x0020: SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
247
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200248/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200249static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200250static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100251static int unicode_modifiable(PyObject *unicode);
252
Victor Stinnerfe226c02011-10-03 03:52:20 +0200253
Alexander Belopolsky40018472011-02-26 01:02:56 +0000254static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100255_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200256static PyObject *
257_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
258static PyObject *
259_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
260
261static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000262unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000263 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100264 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000265 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
266
Alexander Belopolsky40018472011-02-26 01:02:56 +0000267static void
268raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300269 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100270 PyObject *unicode,
271 Py_ssize_t startpos, Py_ssize_t endpos,
272 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000273
/* Same for linebreaks: a 128-entry table indexed by ASCII code point,
   1 for line break characters and 0 otherwise. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F
       0x000A: LINE FEED
       0x000B: LINE TABULATION
       0x000C: FORM FEED
       0x000D: CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F
       0x001C: FILE SEPARATOR
       0x001D: GROUP SEPARATOR
       0x001E: RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
301
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300302#include "clinic/unicodeobject.c.h"
303
/* Codec error handlers recognised by name, plus _Py_ERROR_OTHER for any
   name that is not in the list. */
typedef enum {
    _Py_ERROR_UNKNOWN=0,
    _Py_ERROR_STRICT,
    _Py_ERROR_SURROGATEESCAPE,
    _Py_ERROR_REPLACE,
    _Py_ERROR_IGNORE,
    _Py_ERROR_BACKSLASHREPLACE,
    _Py_ERROR_SURROGATEPASS,
    _Py_ERROR_XMLCHARREFREPLACE,
    _Py_ERROR_OTHER
} _Py_error_handler;

/* Map an error handler name to its _Py_error_handler value.

   errors: NUL-terminated handler name, or NULL.

   Returns _Py_ERROR_STRICT for NULL or "strict", the matching constant for
   the other known names (compared case-sensitively), and _Py_ERROR_OTHER
   for anything else. */
static _Py_error_handler
get_error_handler(const char *errors)
{
    static const struct {
        const char *name;
        _Py_error_handler handler;
    } known_handlers[] = {
        {"strict", _Py_ERROR_STRICT},
        {"surrogateescape", _Py_ERROR_SURROGATEESCAPE},
        {"replace", _Py_ERROR_REPLACE},
        {"ignore", _Py_ERROR_IGNORE},
        {"backslashreplace", _Py_ERROR_BACKSLASHREPLACE},
        {"surrogatepass", _Py_ERROR_SURROGATEPASS},
        {"xmlcharrefreplace", _Py_ERROR_XMLCHARREFREPLACE},
    };
    size_t idx;

    if (errors == NULL) {
        return _Py_ERROR_STRICT;
    }
    for (idx = 0; idx < sizeof(known_handlers) / sizeof(known_handlers[0]); idx++) {
        if (strcmp(errors, known_handlers[idx].name) == 0) {
            return known_handlers[idx].handler;
        }
    }
    return _Py_ERROR_OTHER;
}
342
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300343/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
344 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000345Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000346PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000347{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000348#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000349 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000350#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000351 /* This is actually an illegal character, so it should
352 not be passed to unichr. */
353 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000354#endif
355}
356
#ifdef Py_DEBUG
/* Debug-build validator for the internal layout of a str object.

   op:            object to inspect; must pass PyUnicode_Check().
   check_content: when non-zero (and the string is not of WCHAR kind), also
                  scan the characters to verify that the narrowest possible
                  kind is used and that the buffer is NUL-terminated.

   Every violated invariant trips an assert().  Returns 1 when all checks
   pass, so a call can itself be wrapped in assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;
    /* Kind whose character data has the same width as wchar_t. */
#if SIZEOF_WCHAR_T == 2
    const unsigned int wstr_kind = PyUnicode_2BYTE_KIND;
#else
    const unsigned int wstr_kind = PyUnicode_4BYTE_KIND;
#endif

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact != 1) {
            /* Non-compact object: the characters are reached through the
               data pointer of the PyUnicodeObject. */
            data = ((PyUnicodeObject *)op)->data.any;
            if (kind != PyUnicode_WCHAR_KIND) {
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
                else
                    assert (compact->utf8 != data);
            }
            else {
                /* WCHAR kind: not ready, only wstr is populated. */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
        }
        else {
            /* Compact, non-ASCII object: the characters follow the
               PyCompactUnicodeObject header. */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }

        /* wstr must alias the data exactly when the kind matches the width
           of wchar_t. */
        if (kind != PyUnicode_WCHAR_KIND) {
            if (kind == wstr_kind) {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            }
            else
                assert(ascii->wstr != data);
        }

        /* A missing buffer must have a zero length recorded. */
        if (compact->utf8 == NULL)
            assert(compact->utf8_length == 0);
        if (ascii->wstr == NULL)
            assert(compact->wstr_length == 0);
    }
    else {
        /* Compact ASCII object. */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        void *chars = PyUnicode_DATA(ascii);
        Py_UCS4 maxchar = 0;
        Py_ssize_t pos;

        for (pos = 0; pos < ascii->length; pos++) {
            Py_UCS4 ch = PyUnicode_READ(kind, chars, pos);
            if (maxchar < ch)
                maxchar = ch;
        }

        switch (kind) {
        case PyUnicode_1BYTE_KIND:
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else
                assert(maxchar < 128);
            break;
        case PyUnicode_2BYTE_KIND:
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
            break;
        default:
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
            break;
        }
        /* The character after the last one must be a NUL terminator. */
        assert(PyUnicode_READ(kind, chars, ascii->length) == 0);
    }
    return 1;
}
#endif
472
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100473static PyObject*
474unicode_result_wchar(PyObject *unicode)
475{
476#ifndef Py_DEBUG
477 Py_ssize_t len;
478
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100479 len = _PyUnicode_WSTR_LENGTH(unicode);
480 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100481 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200482 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100483 }
484
485 if (len == 1) {
486 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100487 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100488 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
489 Py_DECREF(unicode);
490 return latin1_char;
491 }
492 }
493
494 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200495 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100496 return NULL;
497 }
498#else
Victor Stinneraa771272012-10-04 02:32:58 +0200499 assert(Py_REFCNT(unicode) == 1);
500
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100501 /* don't make the result ready in debug mode to ensure that the caller
502 makes the string ready before using it */
503 assert(_PyUnicode_CheckConsistency(unicode, 1));
504#endif
505 return unicode;
506}
507
508static PyObject*
509unicode_result_ready(PyObject *unicode)
510{
511 Py_ssize_t length;
512
513 length = PyUnicode_GET_LENGTH(unicode);
514 if (length == 0) {
515 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100516 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200517 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100518 }
519 return unicode_empty;
520 }
521
522 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200523 void *data = PyUnicode_DATA(unicode);
524 int kind = PyUnicode_KIND(unicode);
525 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100526 if (ch < 256) {
527 PyObject *latin1_char = unicode_latin1[ch];
528 if (latin1_char != NULL) {
529 if (unicode != latin1_char) {
530 Py_INCREF(latin1_char);
531 Py_DECREF(unicode);
532 }
533 return latin1_char;
534 }
535 else {
536 assert(_PyUnicode_CheckConsistency(unicode, 1));
537 Py_INCREF(unicode);
538 unicode_latin1[ch] = unicode;
539 return unicode;
540 }
541 }
542 }
543
544 assert(_PyUnicode_CheckConsistency(unicode, 1));
545 return unicode;
546}
547
548static PyObject*
549unicode_result(PyObject *unicode)
550{
551 assert(_PyUnicode_CHECK(unicode));
552 if (PyUnicode_IS_READY(unicode))
553 return unicode_result_ready(unicode);
554 else
555 return unicode_result_wchar(unicode);
556}
557
Victor Stinnerc4b49542011-12-11 22:44:26 +0100558static PyObject*
559unicode_result_unchanged(PyObject *unicode)
560{
561 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500562 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100563 return NULL;
564 Py_INCREF(unicode);
565 return unicode;
566 }
567 else
568 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100569 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100570}
571
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200572/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
573 ASCII, Latin1, UTF-8, etc. */
574static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200575backslashreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200576 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
577{
Victor Stinnerad771582015-10-09 12:38:53 +0200578 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200579 Py_UCS4 ch;
580 enum PyUnicode_Kind kind;
581 void *data;
582
583 assert(PyUnicode_IS_READY(unicode));
584 kind = PyUnicode_KIND(unicode);
585 data = PyUnicode_DATA(unicode);
586
587 size = 0;
588 /* determine replacement size */
589 for (i = collstart; i < collend; ++i) {
590 Py_ssize_t incr;
591
592 ch = PyUnicode_READ(kind, data, i);
593 if (ch < 0x100)
594 incr = 2+2;
595 else if (ch < 0x10000)
596 incr = 2+4;
597 else {
598 assert(ch <= MAX_UNICODE);
Victor Stinner3fa36ff2015-10-09 03:37:11 +0200599 incr = 2+8;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200600 }
601 if (size > PY_SSIZE_T_MAX - incr) {
602 PyErr_SetString(PyExc_OverflowError,
603 "encoded result is too long for a Python string");
604 return NULL;
605 }
606 size += incr;
607 }
608
Victor Stinnerad771582015-10-09 12:38:53 +0200609 str = _PyBytesWriter_Prepare(writer, str, size);
610 if (str == NULL)
611 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200612
613 /* generate replacement */
614 for (i = collstart; i < collend; ++i) {
615 ch = PyUnicode_READ(kind, data, i);
Victor Stinner797485e2015-10-09 03:17:30 +0200616 *str++ = '\\';
617 if (ch >= 0x00010000) {
618 *str++ = 'U';
619 *str++ = Py_hexdigits[(ch>>28)&0xf];
620 *str++ = Py_hexdigits[(ch>>24)&0xf];
621 *str++ = Py_hexdigits[(ch>>20)&0xf];
622 *str++ = Py_hexdigits[(ch>>16)&0xf];
623 *str++ = Py_hexdigits[(ch>>12)&0xf];
624 *str++ = Py_hexdigits[(ch>>8)&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200625 }
Victor Stinner797485e2015-10-09 03:17:30 +0200626 else if (ch >= 0x100) {
627 *str++ = 'u';
628 *str++ = Py_hexdigits[(ch>>12)&0xf];
629 *str++ = Py_hexdigits[(ch>>8)&0xf];
630 }
631 else
632 *str++ = 'x';
633 *str++ = Py_hexdigits[(ch>>4)&0xf];
634 *str++ = Py_hexdigits[ch&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200635 }
636 return str;
637}
638
639/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
640 ASCII, Latin1, UTF-8, etc. */
641static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200642xmlcharrefreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200643 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
644{
Victor Stinnerad771582015-10-09 12:38:53 +0200645 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200646 Py_UCS4 ch;
647 enum PyUnicode_Kind kind;
648 void *data;
649
650 assert(PyUnicode_IS_READY(unicode));
651 kind = PyUnicode_KIND(unicode);
652 data = PyUnicode_DATA(unicode);
653
654 size = 0;
655 /* determine replacement size */
656 for (i = collstart; i < collend; ++i) {
657 Py_ssize_t incr;
658
659 ch = PyUnicode_READ(kind, data, i);
660 if (ch < 10)
661 incr = 2+1+1;
662 else if (ch < 100)
663 incr = 2+2+1;
664 else if (ch < 1000)
665 incr = 2+3+1;
666 else if (ch < 10000)
667 incr = 2+4+1;
668 else if (ch < 100000)
669 incr = 2+5+1;
670 else if (ch < 1000000)
671 incr = 2+6+1;
672 else {
673 assert(ch <= MAX_UNICODE);
674 incr = 2+7+1;
675 }
676 if (size > PY_SSIZE_T_MAX - incr) {
677 PyErr_SetString(PyExc_OverflowError,
678 "encoded result is too long for a Python string");
679 return NULL;
680 }
681 size += incr;
682 }
683
Victor Stinnerad771582015-10-09 12:38:53 +0200684 str = _PyBytesWriter_Prepare(writer, str, size);
685 if (str == NULL)
686 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200687
688 /* generate replacement */
689 for (i = collstart; i < collend; ++i) {
690 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
691 }
692 return str;
693}
694
Thomas Wouters477c8d52006-05-27 19:21:47 +0000695/* --- Bloom Filters ----------------------------------------------------- */
696
/* Simple "bloom filters" for Unicode characters.
   To keep things simple a single bitmask is used: the low bits of each
   character, (ch & (BLOOM_WIDTH - 1)), select the bit that represents it.
   A clear bit proves the character is absent; a set bit only means it may
   be present. */

/* the linebreak mask is set up by Unicode_Init below */

/* Number of usable bits in the mask: the largest of 128/64/32 that fits
   in a long. */
#if LONG_BIT >= 128
#define BLOOM_WIDTH 128
#elif LONG_BIT >= 64
#define BLOOM_WIDTH 64
#elif LONG_BIT >= 32
#define BLOOM_WIDTH 32
#else
#error "LONG_BIT is smaller than 32"
#endif

#define BLOOM_MASK unsigned long

/* Starts with every bit set, so until it is replaced BLOOM() never rejects
   a character. */
static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;

/* Non-zero if the bit for ch is set in mask.  Both arguments are
   parenthesized so that compound expressions (e.g. BLOOM(a | b, ch)) are
   evaluated as a unit. */
#define BLOOM(mask, ch)     ((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1))))

/* Line-break test: table lookup for ch < 128, otherwise the bloom mask is
   consulted first and Py_UNICODE_ISLINEBREAK() only runs when the mask does
   not rule the character out.  Note: ch is evaluated more than once. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000722
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700723static inline BLOOM_MASK
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200724make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000725{
Victor Stinnera85af502013-04-09 21:53:54 +0200726#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
727 do { \
728 TYPE *data = (TYPE *)PTR; \
729 TYPE *end = data + LEN; \
730 Py_UCS4 ch; \
731 for (; data != end; data++) { \
732 ch = *data; \
733 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
734 } \
735 break; \
736 } while (0)
737
Thomas Wouters477c8d52006-05-27 19:21:47 +0000738 /* calculate simple bloom-style bitmask for a given unicode string */
739
Antoine Pitrouf068f942010-01-13 14:19:12 +0000740 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000741
742 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200743 switch (kind) {
744 case PyUnicode_1BYTE_KIND:
745 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
746 break;
747 case PyUnicode_2BYTE_KIND:
748 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
749 break;
750 case PyUnicode_4BYTE_KIND:
751 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
752 break;
753 default:
754 assert(0);
755 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000756 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200757
758#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000759}
760
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300761static int
762ensure_unicode(PyObject *obj)
763{
764 if (!PyUnicode_Check(obj)) {
765 PyErr_Format(PyExc_TypeError,
766 "must be str, not %.100s",
767 Py_TYPE(obj)->tp_name);
768 return -1;
769 }
770 return PyUnicode_READY(obj);
771}
772
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200773/* Compilation of templated routines */
774
775#include "stringlib/asciilib.h"
776#include "stringlib/fastsearch.h"
777#include "stringlib/partition.h"
778#include "stringlib/split.h"
779#include "stringlib/count.h"
780#include "stringlib/find.h"
781#include "stringlib/find_max_char.h"
782#include "stringlib/localeutil.h"
783#include "stringlib/undef.h"
784
785#include "stringlib/ucs1lib.h"
786#include "stringlib/fastsearch.h"
787#include "stringlib/partition.h"
788#include "stringlib/split.h"
789#include "stringlib/count.h"
790#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300791#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200792#include "stringlib/find_max_char.h"
793#include "stringlib/localeutil.h"
794#include "stringlib/undef.h"
795
796#include "stringlib/ucs2lib.h"
797#include "stringlib/fastsearch.h"
798#include "stringlib/partition.h"
799#include "stringlib/split.h"
800#include "stringlib/count.h"
801#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300802#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200803#include "stringlib/find_max_char.h"
804#include "stringlib/localeutil.h"
805#include "stringlib/undef.h"
806
807#include "stringlib/ucs4lib.h"
808#include "stringlib/fastsearch.h"
809#include "stringlib/partition.h"
810#include "stringlib/split.h"
811#include "stringlib/count.h"
812#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300813#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200814#include "stringlib/find_max_char.h"
815#include "stringlib/localeutil.h"
816#include "stringlib/undef.h"
817
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200818#include "stringlib/unicodedefs.h"
819#include "stringlib/fastsearch.h"
820#include "stringlib/count.h"
821#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100822#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200823
Guido van Rossumd57fd912000-03-10 22:53:23 +0000824/* --- Unicode Object ----------------------------------------------------- */
825
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200826static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200827fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200828
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700829static inline Py_ssize_t
830findchar(const void *s, int kind,
831 Py_ssize_t size, Py_UCS4 ch,
832 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200833{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200834 switch (kind) {
835 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200836 if ((Py_UCS1) ch != ch)
837 return -1;
838 if (direction > 0)
839 return ucs1lib_find_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
840 else
841 return ucs1lib_rfind_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200842 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200843 if ((Py_UCS2) ch != ch)
844 return -1;
845 if (direction > 0)
846 return ucs2lib_find_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
847 else
848 return ucs2lib_rfind_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200849 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200850 if (direction > 0)
851 return ucs4lib_find_char((Py_UCS4 *) s, size, ch);
852 else
853 return ucs4lib_rfind_char((Py_UCS4 *) s, size, ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200854 default:
855 assert(0);
856 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200857 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200858}
859
#ifdef Py_DEBUG
/* Debug helper: overwrite the characters in [old_length, length) of a
   Unicode string with 0xff bytes so that use of uninitialized data is
   detected earlier.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least
   for ASCII and UCS-4 strings: U+00FF is invalid in ASCII and U+FFFFFFFF is
   an invalid character in Unicode 6.0.

   Does nothing when the string is not longer than old_length. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    int kind = PyUnicode_KIND(unicode);
    Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t length = _PyUnicode_LENGTH(unicode);

    if (length > old_length) {
        /* kind doubles as the character width in bytes */
        memset(data + old_length * kind, 0xff, (length - old_length) * kind);
    }
}
#endif
878
Victor Stinnerfe226c02011-10-03 03:52:20 +0200879static PyObject*
880resize_compact(PyObject *unicode, Py_ssize_t length)
881{
882 Py_ssize_t char_size;
883 Py_ssize_t struct_size;
884 Py_ssize_t new_size;
885 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100886 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +0200887#ifdef Py_DEBUG
888 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
889#endif
890
Victor Stinner79891572012-05-03 13:43:07 +0200891 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200892 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100893 assert(PyUnicode_IS_COMPACT(unicode));
894
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200895 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100896 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200897 struct_size = sizeof(PyASCIIObject);
898 else
899 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200900 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200901
Victor Stinnerfe226c02011-10-03 03:52:20 +0200902 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
903 PyErr_NoMemory();
904 return NULL;
905 }
906 new_size = (struct_size + (length + 1) * char_size);
907
Serhiy Storchaka7aa69082015-12-03 01:02:03 +0200908 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
909 PyObject_DEL(_PyUnicode_UTF8(unicode));
910 _PyUnicode_UTF8(unicode) = NULL;
911 _PyUnicode_UTF8_LENGTH(unicode) = 0;
912 }
Victor Stinner84def372011-12-11 20:04:56 +0100913 _Py_DEC_REFTOTAL;
914 _Py_ForgetReference(unicode);
915
Serhiy Storchaka20b39b22014-09-28 11:27:24 +0300916 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +0100917 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100918 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200919 PyErr_NoMemory();
920 return NULL;
921 }
Victor Stinner84def372011-12-11 20:04:56 +0100922 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200923 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100924
Victor Stinnerfe226c02011-10-03 03:52:20 +0200925 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200926 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200927 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100928 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200929 _PyUnicode_WSTR_LENGTH(unicode) = length;
930 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100931 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
932 PyObject_DEL(_PyUnicode_WSTR(unicode));
933 _PyUnicode_WSTR(unicode) = NULL;
Victor Stinner5bc03a62016-01-27 16:56:53 +0100934 if (!PyUnicode_IS_ASCII(unicode))
935 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100936 }
Victor Stinnerafffce42012-10-03 23:03:17 +0200937#ifdef Py_DEBUG
938 unicode_fill_invalid(unicode, old_length);
939#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200940 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
941 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200942 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200943 return unicode;
944}
945
Alexander Belopolsky40018472011-02-26 01:02:56 +0000946static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200947resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000948{
Victor Stinner95663112011-10-04 01:03:50 +0200949 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100950 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200951 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200952 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000953
Victor Stinnerfe226c02011-10-03 03:52:20 +0200954 if (PyUnicode_IS_READY(unicode)) {
955 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200956 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200957 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +0200958#ifdef Py_DEBUG
959 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
960#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200961
962 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200963 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200964 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
965 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200966
967 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
968 PyErr_NoMemory();
969 return -1;
970 }
971 new_size = (length + 1) * char_size;
972
Victor Stinner7a9105a2011-12-12 00:13:42 +0100973 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
974 {
975 PyObject_DEL(_PyUnicode_UTF8(unicode));
976 _PyUnicode_UTF8(unicode) = NULL;
977 _PyUnicode_UTF8_LENGTH(unicode) = 0;
978 }
979
Victor Stinnerfe226c02011-10-03 03:52:20 +0200980 data = (PyObject *)PyObject_REALLOC(data, new_size);
981 if (data == NULL) {
982 PyErr_NoMemory();
983 return -1;
984 }
985 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200986 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200987 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200988 _PyUnicode_WSTR_LENGTH(unicode) = length;
989 }
990 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200991 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200992 _PyUnicode_UTF8_LENGTH(unicode) = length;
993 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200994 _PyUnicode_LENGTH(unicode) = length;
995 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +0200996#ifdef Py_DEBUG
997 unicode_fill_invalid(unicode, old_length);
998#endif
Victor Stinner95663112011-10-04 01:03:50 +0200999 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001000 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001001 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001002 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001003 }
Victor Stinner95663112011-10-04 01:03:50 +02001004 assert(_PyUnicode_WSTR(unicode) != NULL);
1005
1006 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001007 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +02001008 PyErr_NoMemory();
1009 return -1;
1010 }
Victor Stinner7a9105a2011-12-12 00:13:42 +01001011 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +02001012 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +01001013 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +02001014 if (!wstr) {
1015 PyErr_NoMemory();
1016 return -1;
1017 }
1018 _PyUnicode_WSTR(unicode) = wstr;
1019 _PyUnicode_WSTR(unicode)[length] = 0;
1020 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001021 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001022 return 0;
1023}
1024
Victor Stinnerfe226c02011-10-03 03:52:20 +02001025static PyObject*
1026resize_copy(PyObject *unicode, Py_ssize_t length)
1027{
1028 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001029 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001030 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001031
Benjamin Petersonbac79492012-01-14 13:34:47 -05001032 if (PyUnicode_READY(unicode) == -1)
Victor Stinner7a9105a2011-12-12 00:13:42 +01001033 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001034
1035 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1036 if (copy == NULL)
1037 return NULL;
1038
1039 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +02001040 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001041 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +02001042 }
1043 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001044 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001045
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001046 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001047 if (w == NULL)
1048 return NULL;
1049 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1050 copy_length = Py_MIN(copy_length, length);
Christian Heimesf051e432016-09-13 20:22:02 +02001051 memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +02001052 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001053 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001054 }
1055}
1056
Guido van Rossumd57fd912000-03-10 22:53:23 +00001057/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +00001058 Ux0000 terminated; some code (e.g. new_identifier)
1059 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001060
1061 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +00001062 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001063
1064*/
1065
Alexander Belopolsky40018472011-02-26 01:02:56 +00001066static PyUnicodeObject *
1067_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001068{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001069 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001070 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001071
Thomas Wouters477c8d52006-05-27 19:21:47 +00001072 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001073 if (length == 0 && unicode_empty != NULL) {
1074 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001075 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001076 }
1077
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001078 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001079 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001080 return (PyUnicodeObject *)PyErr_NoMemory();
1081 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001082 if (length < 0) {
1083 PyErr_SetString(PyExc_SystemError,
1084 "Negative size passed to _PyUnicode_New");
1085 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001086 }
1087
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001088 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1089 if (unicode == NULL)
1090 return NULL;
1091 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +01001092
1093 _PyUnicode_WSTR_LENGTH(unicode) = length;
1094 _PyUnicode_HASH(unicode) = -1;
1095 _PyUnicode_STATE(unicode).interned = 0;
1096 _PyUnicode_STATE(unicode).kind = 0;
1097 _PyUnicode_STATE(unicode).compact = 0;
1098 _PyUnicode_STATE(unicode).ready = 0;
1099 _PyUnicode_STATE(unicode).ascii = 0;
1100 _PyUnicode_DATA_ANY(unicode) = NULL;
1101 _PyUnicode_LENGTH(unicode) = 0;
1102 _PyUnicode_UTF8(unicode) = NULL;
1103 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1104
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001105 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1106 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001107 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00001108 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001109 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00001110 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001111
Jeremy Hyltond8082792003-09-16 19:41:39 +00001112 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +00001113 * the caller fails before initializing str -- unicode_resize()
1114 * reads str[0], and the Keep-Alive optimization can keep memory
1115 * allocated for str alive across a call to unicode_dealloc(unicode).
1116 * We don't want unicode_resize to read uninitialized memory in
1117 * that case.
1118 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001119 _PyUnicode_WSTR(unicode)[0] = 0;
1120 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +01001121
Victor Stinner7931d9a2011-11-04 00:22:48 +01001122 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001123 return unicode;
1124}
1125
Victor Stinnerf42dc442011-10-02 23:33:16 +02001126static const char*
1127unicode_kind_name(PyObject *unicode)
1128{
Victor Stinner42dfd712011-10-03 14:41:45 +02001129 /* don't check consistency: unicode_kind_name() is called from
1130 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +02001131 if (!PyUnicode_IS_COMPACT(unicode))
1132 {
1133 if (!PyUnicode_IS_READY(unicode))
1134 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -06001135 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001136 {
1137 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001138 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001139 return "legacy ascii";
1140 else
1141 return "legacy latin1";
1142 case PyUnicode_2BYTE_KIND:
1143 return "legacy UCS2";
1144 case PyUnicode_4BYTE_KIND:
1145 return "legacy UCS4";
1146 default:
1147 return "<legacy invalid kind>";
1148 }
1149 }
1150 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -06001151 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001152 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001153 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001154 return "ascii";
1155 else
Victor Stinnera3b334d2011-10-03 13:53:37 +02001156 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001157 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001158 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001159 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001160 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001161 default:
1162 return "<invalid compact kind>";
1163 }
1164}
1165
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001166#ifdef Py_DEBUG
/* Functions wrapping macros for use in debugger */

/* Callable form of the PyUnicode_UTF8() macro. */
char *
_PyUnicode_utf8(void *unicode)
{
    return PyUnicode_UTF8(unicode);
}
1171
/* Callable form of the _PyUnicode_COMPACT_DATA() macro. */
void *
_PyUnicode_compact_data(void *unicode)
{
    return _PyUnicode_COMPACT_DATA(unicode);
}
1175void *_PyUnicode_data(void *unicode){
1176 printf("obj %p\n", unicode);
1177 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1178 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1179 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1180 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1181 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1182 return PyUnicode_DATA(unicode);
1183}
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001184
1185void
1186_PyUnicode_Dump(PyObject *op)
1187{
1188 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001189 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1190 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1191 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001192
Victor Stinnera849a4b2011-10-03 12:12:11 +02001193 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001194 {
1195 if (ascii->state.ascii)
1196 data = (ascii + 1);
1197 else
1198 data = (compact + 1);
1199 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001200 else
1201 data = unicode->data.any;
Victor Stinner293f3f52014-07-01 08:57:10 +02001202 printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1203 unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001204
Victor Stinnera849a4b2011-10-03 12:12:11 +02001205 if (ascii->wstr == data)
1206 printf("shared ");
1207 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001208
Victor Stinnera3b334d2011-10-03 13:53:37 +02001209 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinner293f3f52014-07-01 08:57:10 +02001210 printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
Victor Stinnera849a4b2011-10-03 12:12:11 +02001211 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1212 printf("shared ");
Victor Stinner293f3f52014-07-01 08:57:10 +02001213 printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
1214 compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001215 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001216 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001217}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001218#endif
1219
1220PyObject *
1221PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1222{
1223 PyObject *obj;
1224 PyCompactUnicodeObject *unicode;
1225 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001226 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001227 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001228 Py_ssize_t char_size;
1229 Py_ssize_t struct_size;
1230
1231 /* Optimization for empty strings */
1232 if (size == 0 && unicode_empty != NULL) {
1233 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001234 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001235 }
1236
Victor Stinner9e9d6892011-10-04 01:02:02 +02001237 is_ascii = 0;
1238 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001239 struct_size = sizeof(PyCompactUnicodeObject);
1240 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001241 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001242 char_size = 1;
1243 is_ascii = 1;
1244 struct_size = sizeof(PyASCIIObject);
1245 }
1246 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001247 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001248 char_size = 1;
1249 }
1250 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001251 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001252 char_size = 2;
1253 if (sizeof(wchar_t) == 2)
1254 is_sharing = 1;
1255 }
1256 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001257 if (maxchar > MAX_UNICODE) {
1258 PyErr_SetString(PyExc_SystemError,
1259 "invalid maximum character passed to PyUnicode_New");
1260 return NULL;
1261 }
Victor Stinner8f825062012-04-27 13:55:39 +02001262 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001263 char_size = 4;
1264 if (sizeof(wchar_t) == 4)
1265 is_sharing = 1;
1266 }
1267
1268 /* Ensure we won't overflow the size. */
1269 if (size < 0) {
1270 PyErr_SetString(PyExc_SystemError,
1271 "Negative size passed to PyUnicode_New");
1272 return NULL;
1273 }
1274 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1275 return PyErr_NoMemory();
1276
1277 /* Duplicated allocation code from _PyObject_New() instead of a call to
1278 * PyObject_New() so we are able to allocate space for the object and
1279 * it's data buffer.
1280 */
1281 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1282 if (obj == NULL)
1283 return PyErr_NoMemory();
1284 obj = PyObject_INIT(obj, &PyUnicode_Type);
1285 if (obj == NULL)
1286 return NULL;
1287
1288 unicode = (PyCompactUnicodeObject *)obj;
1289 if (is_ascii)
1290 data = ((PyASCIIObject*)obj) + 1;
1291 else
1292 data = unicode + 1;
1293 _PyUnicode_LENGTH(unicode) = size;
1294 _PyUnicode_HASH(unicode) = -1;
1295 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001296 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001297 _PyUnicode_STATE(unicode).compact = 1;
1298 _PyUnicode_STATE(unicode).ready = 1;
1299 _PyUnicode_STATE(unicode).ascii = is_ascii;
1300 if (is_ascii) {
1301 ((char*)data)[size] = 0;
1302 _PyUnicode_WSTR(unicode) = NULL;
1303 }
Victor Stinner8f825062012-04-27 13:55:39 +02001304 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001305 ((char*)data)[size] = 0;
1306 _PyUnicode_WSTR(unicode) = NULL;
1307 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001308 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001309 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001310 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001311 else {
1312 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001313 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001314 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001315 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001316 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001317 ((Py_UCS4*)data)[size] = 0;
1318 if (is_sharing) {
1319 _PyUnicode_WSTR_LENGTH(unicode) = size;
1320 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1321 }
1322 else {
1323 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1324 _PyUnicode_WSTR(unicode) = NULL;
1325 }
1326 }
Victor Stinner8f825062012-04-27 13:55:39 +02001327#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001328 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001329#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001330 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001331 return obj;
1332}
1333
#if SIZEOF_WCHAR_T == 2
/* Convert the 16-bit wchar_t range [begin, end) to UCS4, writing into the
   4-byte data of unicode.  A high surrogate immediately followed by a low
   surrogate is joined into one code point; every other unit is copied
   unchanged.  The other conversions are implemented as macros for
   efficiency.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    src = begin;
    while (src < end) {
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (Py_UNICODE_IS_HIGH_SURROGATE(src[0])
            && (src+1) < end
            && Py_UNICODE_IS_LOW_SURROGATE(src[1]))
        {
            /* surrogate pair: two input units, one output code point */
            *dst++ = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
            src += 2;
            continue;
        }
        *dst++ = *src++;
    }
    /* the output must have been filled exactly */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1373
Victor Stinnercd9950f2011-10-02 00:34:53 +02001374static int
Victor Stinner488fa492011-12-12 00:01:39 +01001375unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001376{
Victor Stinner488fa492011-12-12 00:01:39 +01001377 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001378 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001379 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001380 return -1;
1381 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001382 return 0;
1383}
1384
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001385static int
1386_copy_characters(PyObject *to, Py_ssize_t to_start,
1387 PyObject *from, Py_ssize_t from_start,
1388 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001389{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001390 unsigned int from_kind, to_kind;
1391 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001392
Victor Stinneree4544c2012-05-09 22:24:08 +02001393 assert(0 <= how_many);
1394 assert(0 <= from_start);
1395 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001396 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001397 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001398 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001399
Victor Stinnerd3f08822012-05-29 12:57:52 +02001400 assert(PyUnicode_Check(to));
1401 assert(PyUnicode_IS_READY(to));
1402 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1403
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001404 if (how_many == 0)
1405 return 0;
1406
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001407 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001408 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001409 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001410 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001411
Victor Stinnerf1852262012-06-16 16:38:26 +02001412#ifdef Py_DEBUG
1413 if (!check_maxchar
1414 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1415 {
1416 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1417 Py_UCS4 ch;
1418 Py_ssize_t i;
1419 for (i=0; i < how_many; i++) {
1420 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1421 assert(ch <= to_maxchar);
1422 }
1423 }
1424#endif
1425
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001426 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001427 if (check_maxchar
1428 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1429 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001430 /* Writing Latin-1 characters into an ASCII string requires to
1431 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001432 Py_UCS4 max_char;
1433 max_char = ucs1lib_find_max_char(from_data,
1434 (Py_UCS1*)from_data + how_many);
1435 if (max_char >= 128)
1436 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001437 }
Christian Heimesf051e432016-09-13 20:22:02 +02001438 memcpy((char*)to_data + to_kind * to_start,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001439 (char*)from_data + from_kind * from_start,
1440 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001441 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001442 else if (from_kind == PyUnicode_1BYTE_KIND
1443 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001444 {
1445 _PyUnicode_CONVERT_BYTES(
1446 Py_UCS1, Py_UCS2,
1447 PyUnicode_1BYTE_DATA(from) + from_start,
1448 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1449 PyUnicode_2BYTE_DATA(to) + to_start
1450 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001451 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001452 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001453 && to_kind == PyUnicode_4BYTE_KIND)
1454 {
1455 _PyUnicode_CONVERT_BYTES(
1456 Py_UCS1, Py_UCS4,
1457 PyUnicode_1BYTE_DATA(from) + from_start,
1458 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1459 PyUnicode_4BYTE_DATA(to) + to_start
1460 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001461 }
1462 else if (from_kind == PyUnicode_2BYTE_KIND
1463 && to_kind == PyUnicode_4BYTE_KIND)
1464 {
1465 _PyUnicode_CONVERT_BYTES(
1466 Py_UCS2, Py_UCS4,
1467 PyUnicode_2BYTE_DATA(from) + from_start,
1468 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1469 PyUnicode_4BYTE_DATA(to) + to_start
1470 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001471 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001472 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001473 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1474
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001475 if (!check_maxchar) {
1476 if (from_kind == PyUnicode_2BYTE_KIND
1477 && to_kind == PyUnicode_1BYTE_KIND)
1478 {
1479 _PyUnicode_CONVERT_BYTES(
1480 Py_UCS2, Py_UCS1,
1481 PyUnicode_2BYTE_DATA(from) + from_start,
1482 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1483 PyUnicode_1BYTE_DATA(to) + to_start
1484 );
1485 }
1486 else if (from_kind == PyUnicode_4BYTE_KIND
1487 && to_kind == PyUnicode_1BYTE_KIND)
1488 {
1489 _PyUnicode_CONVERT_BYTES(
1490 Py_UCS4, Py_UCS1,
1491 PyUnicode_4BYTE_DATA(from) + from_start,
1492 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1493 PyUnicode_1BYTE_DATA(to) + to_start
1494 );
1495 }
1496 else if (from_kind == PyUnicode_4BYTE_KIND
1497 && to_kind == PyUnicode_2BYTE_KIND)
1498 {
1499 _PyUnicode_CONVERT_BYTES(
1500 Py_UCS4, Py_UCS2,
1501 PyUnicode_4BYTE_DATA(from) + from_start,
1502 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1503 PyUnicode_2BYTE_DATA(to) + to_start
1504 );
1505 }
1506 else {
1507 assert(0);
1508 return -1;
1509 }
1510 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001511 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001512 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001513 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001514 Py_ssize_t i;
1515
Victor Stinnera0702ab2011-09-29 14:14:38 +02001516 for (i=0; i < how_many; i++) {
1517 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001518 if (ch > to_maxchar)
1519 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001520 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1521 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001522 }
1523 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001524 return 0;
1525}
1526
Victor Stinnerd3f08822012-05-29 12:57:52 +02001527void
1528_PyUnicode_FastCopyCharacters(
1529 PyObject *to, Py_ssize_t to_start,
1530 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001531{
1532 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1533}
1534
1535Py_ssize_t
1536PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1537 PyObject *from, Py_ssize_t from_start,
1538 Py_ssize_t how_many)
1539{
1540 int err;
1541
1542 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1543 PyErr_BadInternalCall();
1544 return -1;
1545 }
1546
Benjamin Petersonbac79492012-01-14 13:34:47 -05001547 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001548 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001549 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001550 return -1;
1551
Victor Stinnerd3f08822012-05-29 12:57:52 +02001552 if (from_start < 0) {
1553 PyErr_SetString(PyExc_IndexError, "string index out of range");
1554 return -1;
1555 }
1556 if (to_start < 0) {
1557 PyErr_SetString(PyExc_IndexError, "string index out of range");
1558 return -1;
1559 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001560 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1561 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1562 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001563 "Cannot write %zi characters at %zi "
1564 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001565 how_many, to_start, PyUnicode_GET_LENGTH(to));
1566 return -1;
1567 }
1568
1569 if (how_many == 0)
1570 return 0;
1571
Victor Stinner488fa492011-12-12 00:01:39 +01001572 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001573 return -1;
1574
1575 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1576 if (err) {
1577 PyErr_Format(PyExc_SystemError,
1578 "Cannot copy %s characters "
1579 "into a string of %s characters",
1580 unicode_kind_name(from),
1581 unicode_kind_name(to));
1582 return -1;
1583 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001584 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001585}
1586
Victor Stinner17222162011-09-28 22:15:37 +02001587/* Find the maximum code point and count the number of surrogate pairs so a
1588 correct string length can be computed before converting a string to UCS4.
1589 This function counts single surrogates as a character and not as a pair.
1590
1591 Return 0 on success, or -1 on error. */
1592static int
1593find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1594 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001595{
1596 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001597 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001598
Victor Stinnerc53be962011-10-02 21:33:54 +02001599 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001600 *num_surrogates = 0;
1601 *maxchar = 0;
1602
1603 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001604#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001605 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1606 && (iter+1) < end
1607 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1608 {
1609 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1610 ++(*num_surrogates);
1611 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001612 }
1613 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001614#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001615 {
1616 ch = *iter;
1617 iter++;
1618 }
1619 if (ch > *maxchar) {
1620 *maxchar = ch;
1621 if (*maxchar > MAX_UNICODE) {
1622 PyErr_Format(PyExc_ValueError,
1623 "character U+%x is not in range [U+0000; U+10ffff]",
1624 ch);
1625 return -1;
1626 }
1627 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001628 }
1629 return 0;
1630}
1631
/* Convert a string that only has a wchar_t (wstr) representation into its
   canonical 1-, 2- or 4-byte representation, chosen from the largest code
   point found in the wstr buffer.

   Return 0 on success.  Return -1 with an exception set if the buffer
   contains a character outside the Unicode range (error raised by
   find_maxchar_surrogates()) or if memory allocation fails. */
int
_PyUnicode_Ready(PyObject *unicode)
{
    wchar_t *end;
    Py_UCS4 maxchar = 0;
    Py_ssize_t num_surrogates;
#if SIZEOF_WCHAR_T == 2
    Py_ssize_t length_wo_surrogates;
#endif

    /* _PyUnicode_Ready() is only intended for old-style API usage where
       strings were created using _PyObject_New() and where no canonical
       representation (the str field) has been set yet aka strings
       which are not yet ready. */
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
    assert(_PyUnicode_WSTR(unicode) != NULL);
    assert(_PyUnicode_DATA_ANY(unicode) == NULL);
    assert(_PyUnicode_UTF8(unicode) == NULL);
    /* Actually, it should neither be interned nor be anything else: */
    assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);

    /* First pass over the wstr buffer: find the widest character and the
       number of surrogate pairs. */
    end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
    if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
                                &maxchar, &num_surrogates) == -1)
        return -1;

    if (maxchar < 256) {
        /* Latin-1 range: allocate a 1-byte buffer (plus the terminating
           null), narrow the characters into it and release wstr. */
        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
        if (!_PyUnicode_DATA_ANY(unicode)) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
                                 _PyUnicode_WSTR(unicode), end,
                                 PyUnicode_1BYTE_DATA(unicode));
        PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
        if (maxchar < 128) {
            /* Pure ASCII: the data buffer doubles as the UTF-8 buffer. */
            _PyUnicode_STATE(unicode).ascii = 1;
            _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
            _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        }
        else {
            _PyUnicode_STATE(unicode).ascii = 0;
            _PyUnicode_UTF8(unicode) = NULL;
            _PyUnicode_UTF8_LENGTH(unicode) = 0;
        }
        /* The wstr buffer is only released once every read of
           _PyUnicode_WSTR_LENGTH() above is done. */
        PyObject_FREE(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
    }
    /* In this case we might have to convert down from 4-byte native
       wchar_t to 2-byte unicode. */
    else if (maxchar < 65536) {
        assert(num_surrogates == 0 &&
               "FindMaxCharAndNumSurrogatePairs() messed up");

#if SIZEOF_WCHAR_T == 2
        /* We can share representations and are done. */
        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
#else
        /* sizeof(wchar_t) == 4 */
        /* Narrow into a freshly allocated 2-byte buffer, then release
           wstr. */
        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
            2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
        if (!_PyUnicode_DATA_ANY(unicode)) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
                                 _PyUnicode_WSTR(unicode), end,
                                 PyUnicode_2BYTE_DATA(unicode));
        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
        PyObject_FREE(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
#endif
    }
    /* maxchar exceeds 16 bits, we need 4 bytes for unicode characters */
    else {
#if SIZEOF_WCHAR_T == 2
        /* in case the native representation is 2-bytes, we need to allocate a
           new normalized 4-byte version. */
        length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
        /* Guard the 4 * (length + 1) size computation against overflow. */
        if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
        if (!_PyUnicode_DATA_ANY(unicode)) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
        /* unicode_convert_wchar_to_ucs4() requires a ready string */
        _PyUnicode_STATE(unicode).ready = 1;
        unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
        PyObject_FREE(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
#else
        assert(num_surrogates == 0);

        /* wchar_t is already 4 bytes wide: the wstr buffer becomes the
           data buffer and is therefore neither freed nor reset. */
        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
#endif
        PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
    }
    _PyUnicode_STATE(unicode).ready = 1;
    assert(_PyUnicode_CheckConsistency(unicode, 1));
    return 0;
}
1760
Alexander Belopolsky40018472011-02-26 01:02:56 +00001761static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001762unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001763{
Walter Dörwald16807132007-05-25 13:52:07 +00001764 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001765 case SSTATE_NOT_INTERNED:
1766 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001767
Benjamin Peterson29060642009-01-31 22:14:21 +00001768 case SSTATE_INTERNED_MORTAL:
1769 /* revive dead object temporarily for DelItem */
1770 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001771 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001772 Py_FatalError(
1773 "deletion of interned string failed");
1774 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001775
Benjamin Peterson29060642009-01-31 22:14:21 +00001776 case SSTATE_INTERNED_IMMORTAL:
1777 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001778
Benjamin Peterson29060642009-01-31 22:14:21 +00001779 default:
1780 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001781 }
1782
Victor Stinner03490912011-10-03 23:45:12 +02001783 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001784 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001785 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001786 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001787 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1788 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001789
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001790 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001791}
1792
#ifdef Py_DEBUG
/* Debug helper: return 1 if `unicode` is the shared empty string or one of
   the cached single-character Latin-1 strings, 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;

    /* Only a ready string of length 1 can be a cached Latin-1 character. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    return ch < 256 && unicode_latin1[ch] == unicode;
}
#endif
1809
Alexander Belopolsky40018472011-02-26 01:02:56 +00001810static int
Victor Stinner488fa492011-12-12 00:01:39 +01001811unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001812{
Victor Stinner488fa492011-12-12 00:01:39 +01001813 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001814 if (Py_REFCNT(unicode) != 1)
1815 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001816 if (_PyUnicode_HASH(unicode) != -1)
1817 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001818 if (PyUnicode_CHECK_INTERNED(unicode))
1819 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001820 if (!PyUnicode_CheckExact(unicode))
1821 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001822#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001823 /* singleton refcount is greater than 1 */
1824 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001825#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001826 return 1;
1827}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001828
Victor Stinnerfe226c02011-10-03 03:52:20 +02001829static int
1830unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1831{
1832 PyObject *unicode;
1833 Py_ssize_t old_length;
1834
1835 assert(p_unicode != NULL);
1836 unicode = *p_unicode;
1837
1838 assert(unicode != NULL);
1839 assert(PyUnicode_Check(unicode));
1840 assert(0 <= length);
1841
Victor Stinner910337b2011-10-03 03:20:16 +02001842 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001843 old_length = PyUnicode_WSTR_LENGTH(unicode);
1844 else
1845 old_length = PyUnicode_GET_LENGTH(unicode);
1846 if (old_length == length)
1847 return 0;
1848
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001849 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001850 _Py_INCREF_UNICODE_EMPTY();
1851 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001852 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001853 Py_SETREF(*p_unicode, unicode_empty);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001854 return 0;
1855 }
1856
Victor Stinner488fa492011-12-12 00:01:39 +01001857 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001858 PyObject *copy = resize_copy(unicode, length);
1859 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001860 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001861 Py_SETREF(*p_unicode, copy);
Benjamin Peterson29060642009-01-31 22:14:21 +00001862 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001863 }
1864
Victor Stinnerfe226c02011-10-03 03:52:20 +02001865 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001866 PyObject *new_unicode = resize_compact(unicode, length);
1867 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001868 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001869 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001870 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001871 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001872 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001873}
1874
Alexander Belopolsky40018472011-02-26 01:02:56 +00001875int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001876PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001877{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001878 PyObject *unicode;
1879 if (p_unicode == NULL) {
1880 PyErr_BadInternalCall();
1881 return -1;
1882 }
1883 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001884 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001885 {
1886 PyErr_BadInternalCall();
1887 return -1;
1888 }
1889 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001890}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001891
Serhiy Storchakad65c9492015-11-02 14:10:23 +02001892/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001893
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001894 WARNING: The function doesn't copy the terminating null character and
1895 doesn't check the maximum character (may write a latin1 character in an
1896 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001897static void
1898unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1899 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001900{
1901 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1902 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001903 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001904
1905 switch (kind) {
1906 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001907 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02001908#ifdef Py_DEBUG
1909 if (PyUnicode_IS_ASCII(unicode)) {
1910 Py_UCS4 maxchar = ucs1lib_find_max_char(
1911 (const Py_UCS1*)str,
1912 (const Py_UCS1*)str + len);
1913 assert(maxchar < 128);
1914 }
1915#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001916 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001917 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001918 }
1919 case PyUnicode_2BYTE_KIND: {
1920 Py_UCS2 *start = (Py_UCS2 *)data + index;
1921 Py_UCS2 *ucs2 = start;
1922 assert(index <= PyUnicode_GET_LENGTH(unicode));
1923
Victor Stinner184252a2012-06-16 02:57:41 +02001924 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001925 *ucs2 = (Py_UCS2)*str;
1926
1927 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001928 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001929 }
1930 default: {
1931 Py_UCS4 *start = (Py_UCS4 *)data + index;
1932 Py_UCS4 *ucs4 = start;
1933 assert(kind == PyUnicode_4BYTE_KIND);
1934 assert(index <= PyUnicode_GET_LENGTH(unicode));
1935
Victor Stinner184252a2012-06-16 02:57:41 +02001936 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001937 *ucs4 = (Py_UCS4)*str;
1938
1939 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001940 }
1941 }
1942}
1943
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001944static PyObject*
1945get_latin1_char(unsigned char ch)
1946{
Victor Stinnera464fc12011-10-02 20:39:30 +02001947 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001948 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001949 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001950 if (!unicode)
1951 return NULL;
1952 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001953 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001954 unicode_latin1[ch] = unicode;
1955 }
1956 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001957 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001958}
1959
Victor Stinner985a82a2014-01-03 12:53:47 +01001960static PyObject*
1961unicode_char(Py_UCS4 ch)
1962{
1963 PyObject *unicode;
1964
1965 assert(ch <= MAX_UNICODE);
1966
Victor Stinnerf3b46b42014-01-03 13:16:00 +01001967 if (ch < 256)
1968 return get_latin1_char(ch);
1969
Victor Stinner985a82a2014-01-03 12:53:47 +01001970 unicode = PyUnicode_New(1, ch);
1971 if (unicode == NULL)
1972 return NULL;
1973 switch (PyUnicode_KIND(unicode)) {
1974 case PyUnicode_1BYTE_KIND:
1975 PyUnicode_1BYTE_DATA(unicode)[0] = (Py_UCS1)ch;
1976 break;
1977 case PyUnicode_2BYTE_KIND:
1978 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
1979 break;
1980 default:
1981 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1982 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
1983 }
1984 assert(_PyUnicode_CheckConsistency(unicode, 1));
1985 return unicode;
1986}
1987
Alexander Belopolsky40018472011-02-26 01:02:56 +00001988PyObject *
1989PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001990{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001991 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001992 Py_UCS4 maxchar = 0;
1993 Py_ssize_t num_surrogates;
1994
1995 if (u == NULL)
1996 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001997
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001998 /* If the Unicode data is known at construction time, we can apply
1999 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002000
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002001 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02002002 if (size == 0)
2003 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00002004
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002005 /* Single character Unicode objects in the Latin-1 range are
2006 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002007 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002008 return get_latin1_char((unsigned char)*u);
2009
2010 /* If not empty and not single character, copy the Unicode data
2011 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02002012 if (find_maxchar_surrogates(u, u + size,
2013 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002014 return NULL;
2015
Victor Stinner8faf8212011-12-08 22:14:11 +01002016 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002017 if (!unicode)
2018 return NULL;
2019
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002020 switch (PyUnicode_KIND(unicode)) {
2021 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002022 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002023 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2024 break;
2025 case PyUnicode_2BYTE_KIND:
2026#if Py_UNICODE_SIZE == 2
Christian Heimesf051e432016-09-13 20:22:02 +02002027 memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002028#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002029 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002030 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2031#endif
2032 break;
2033 case PyUnicode_4BYTE_KIND:
2034#if SIZEOF_WCHAR_T == 2
2035 /* This is the only case which has to process surrogates, thus
2036 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02002037 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002038#else
2039 assert(num_surrogates == 0);
Christian Heimesf051e432016-09-13 20:22:02 +02002040 memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002041#endif
2042 break;
2043 default:
2044 assert(0 && "Impossible state");
2045 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002046
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002047 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002048}
2049
Alexander Belopolsky40018472011-02-26 01:02:56 +00002050PyObject *
2051PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002052{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002053 if (size < 0) {
2054 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00002055 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00002056 return NULL;
2057 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002058 if (u != NULL)
2059 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2060 else
2061 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002062}
2063
Alexander Belopolsky40018472011-02-26 01:02:56 +00002064PyObject *
2065PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00002066{
2067 size_t size = strlen(u);
2068 if (size > PY_SSIZE_T_MAX) {
2069 PyErr_SetString(PyExc_OverflowError, "input too long");
2070 return NULL;
2071 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002072 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002073}
2074
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002075PyObject *
2076_PyUnicode_FromId(_Py_Identifier *id)
2077{
2078 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01002079 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
2080 strlen(id->string),
2081 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002082 if (!id->object)
2083 return NULL;
2084 PyUnicode_InternInPlace(&id->object);
2085 assert(!id->next);
2086 id->next = static_strings;
2087 static_strings = id;
2088 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002089 return id->object;
2090}
2091
2092void
2093_PyUnicode_ClearStaticStrings()
2094{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002095 _Py_Identifier *tmp, *s = static_strings;
2096 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02002097 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002098 tmp = s->next;
2099 s->next = NULL;
2100 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002101 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002102 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002103}
2104
Benjamin Peterson0df54292012-03-26 14:50:32 -04002105/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002106
Victor Stinnerd3f08822012-05-29 12:57:52 +02002107PyObject*
2108_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02002109{
Victor Stinnerd3f08822012-05-29 12:57:52 +02002110 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01002111 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01002112 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02002113#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002114 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02002115#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002116 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01002117 }
Victor Stinner785938e2011-12-11 20:09:03 +01002118 unicode = PyUnicode_New(size, 127);
2119 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02002120 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01002121 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2122 assert(_PyUnicode_CheckConsistency(unicode, 1));
2123 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02002124}
2125
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002126static Py_UCS4
2127kind_maxchar_limit(unsigned int kind)
2128{
Benjamin Petersonead6b532011-12-20 17:23:42 -06002129 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002130 case PyUnicode_1BYTE_KIND:
2131 return 0x80;
2132 case PyUnicode_2BYTE_KIND:
2133 return 0x100;
2134 case PyUnicode_4BYTE_KIND:
2135 return 0x10000;
2136 default:
2137 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01002138 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002139 }
2140}
2141
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -07002142static inline Py_UCS4
Victor Stinnere6abb482012-05-02 01:15:40 +02002143align_maxchar(Py_UCS4 maxchar)
2144{
2145 if (maxchar <= 127)
2146 return 127;
2147 else if (maxchar <= 255)
2148 return 255;
2149 else if (maxchar <= 65535)
2150 return 65535;
2151 else
2152 return MAX_UNICODE;
2153}
2154
Victor Stinner702c7342011-10-05 13:50:52 +02002155static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002156_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00002157{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002158 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002159 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002160
Serhiy Storchaka678db842013-01-26 12:16:36 +02002161 if (size == 0)
2162 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002163 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002164 if (size == 1)
2165 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002166
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002167 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002168 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002169 if (!res)
2170 return NULL;
2171 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002172 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002173 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00002174}
2175
Victor Stinnere57b1c02011-09-28 22:20:48 +02002176static PyObject*
2177_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002178{
2179 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002180 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002181
Serhiy Storchaka678db842013-01-26 12:16:36 +02002182 if (size == 0)
2183 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002184 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002185 if (size == 1)
2186 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002187
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002188 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002189 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002190 if (!res)
2191 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002192 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002193 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002194 else {
2195 _PyUnicode_CONVERT_BYTES(
2196 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2197 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002198 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002199 return res;
2200}
2201
Victor Stinnere57b1c02011-09-28 22:20:48 +02002202static PyObject*
2203_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002204{
2205 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002206 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002207
Serhiy Storchaka678db842013-01-26 12:16:36 +02002208 if (size == 0)
2209 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002210 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002211 if (size == 1)
2212 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002213
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002214 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002215 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002216 if (!res)
2217 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002218 if (max_char < 256)
2219 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2220 PyUnicode_1BYTE_DATA(res));
2221 else if (max_char < 0x10000)
2222 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2223 PyUnicode_2BYTE_DATA(res));
2224 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002225 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002226 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002227 return res;
2228}
2229
2230PyObject*
2231PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2232{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002233 if (size < 0) {
2234 PyErr_SetString(PyExc_ValueError, "size must be positive");
2235 return NULL;
2236 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002237 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002238 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002239 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002240 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002241 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002242 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002243 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002244 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002245 PyErr_SetString(PyExc_SystemError, "invalid kind");
2246 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002247 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002248}
2249
Victor Stinnerece58de2012-04-23 23:36:38 +02002250Py_UCS4
2251_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2252{
2253 enum PyUnicode_Kind kind;
2254 void *startptr, *endptr;
2255
2256 assert(PyUnicode_IS_READY(unicode));
2257 assert(0 <= start);
2258 assert(end <= PyUnicode_GET_LENGTH(unicode));
2259 assert(start <= end);
2260
2261 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2262 return PyUnicode_MAX_CHAR_VALUE(unicode);
2263
2264 if (start == end)
2265 return 127;
2266
Victor Stinner94d558b2012-04-27 22:26:58 +02002267 if (PyUnicode_IS_ASCII(unicode))
2268 return 127;
2269
Victor Stinnerece58de2012-04-23 23:36:38 +02002270 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002271 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002272 endptr = (char *)startptr + end * kind;
2273 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002274 switch(kind) {
2275 case PyUnicode_1BYTE_KIND:
2276 return ucs1lib_find_max_char(startptr, endptr);
2277 case PyUnicode_2BYTE_KIND:
2278 return ucs2lib_find_max_char(startptr, endptr);
2279 case PyUnicode_4BYTE_KIND:
2280 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002281 default:
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002282 assert(0);
2283 return 0;
Victor Stinnerece58de2012-04-23 23:36:38 +02002284 }
2285}
2286
Victor Stinner25a4b292011-10-06 12:31:55 +02002287/* Ensure that a string uses the most efficient storage, if it is not the
2288 case: create a new string with of the right kind. Write NULL into *p_unicode
2289 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002290static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002291unicode_adjust_maxchar(PyObject **p_unicode)
2292{
2293 PyObject *unicode, *copy;
2294 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002295 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002296 unsigned int kind;
2297
2298 assert(p_unicode != NULL);
2299 unicode = *p_unicode;
2300 assert(PyUnicode_IS_READY(unicode));
2301 if (PyUnicode_IS_ASCII(unicode))
2302 return;
2303
2304 len = PyUnicode_GET_LENGTH(unicode);
2305 kind = PyUnicode_KIND(unicode);
2306 if (kind == PyUnicode_1BYTE_KIND) {
2307 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002308 max_char = ucs1lib_find_max_char(u, u + len);
2309 if (max_char >= 128)
2310 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002311 }
2312 else if (kind == PyUnicode_2BYTE_KIND) {
2313 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002314 max_char = ucs2lib_find_max_char(u, u + len);
2315 if (max_char >= 256)
2316 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002317 }
2318 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002319 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002320 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002321 max_char = ucs4lib_find_max_char(u, u + len);
2322 if (max_char >= 0x10000)
2323 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002324 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002325 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002326 if (copy != NULL)
2327 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002328 Py_DECREF(unicode);
2329 *p_unicode = copy;
2330}
2331
Victor Stinner034f6cf2011-09-30 02:26:44 +02002332PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002333_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002334{
Victor Stinner87af4f22011-11-21 23:03:47 +01002335 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002336 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002337
Victor Stinner034f6cf2011-09-30 02:26:44 +02002338 if (!PyUnicode_Check(unicode)) {
2339 PyErr_BadInternalCall();
2340 return NULL;
2341 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002342 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002343 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002344
Victor Stinner87af4f22011-11-21 23:03:47 +01002345 length = PyUnicode_GET_LENGTH(unicode);
2346 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002347 if (!copy)
2348 return NULL;
2349 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2350
Christian Heimesf051e432016-09-13 20:22:02 +02002351 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
Victor Stinner87af4f22011-11-21 23:03:47 +01002352 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002353 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002354 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002355}
2356
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002357
Victor Stinnerbc603d12011-10-02 01:00:40 +02002358/* Widen Unicode objects to larger buffers. Don't write terminating null
2359 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002360
2361void*
2362_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2363{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002364 Py_ssize_t len;
2365 void *result;
2366 unsigned int skind;
2367
Benjamin Petersonbac79492012-01-14 13:34:47 -05002368 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002369 return NULL;
2370
2371 len = PyUnicode_GET_LENGTH(s);
2372 skind = PyUnicode_KIND(s);
2373 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002374 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002375 return NULL;
2376 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002377 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002378 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002379 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002380 if (!result)
2381 return PyErr_NoMemory();
2382 assert(skind == PyUnicode_1BYTE_KIND);
2383 _PyUnicode_CONVERT_BYTES(
2384 Py_UCS1, Py_UCS2,
2385 PyUnicode_1BYTE_DATA(s),
2386 PyUnicode_1BYTE_DATA(s) + len,
2387 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002388 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002389 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002390 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002391 if (!result)
2392 return PyErr_NoMemory();
2393 if (skind == PyUnicode_2BYTE_KIND) {
2394 _PyUnicode_CONVERT_BYTES(
2395 Py_UCS2, Py_UCS4,
2396 PyUnicode_2BYTE_DATA(s),
2397 PyUnicode_2BYTE_DATA(s) + len,
2398 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002399 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002400 else {
2401 assert(skind == PyUnicode_1BYTE_KIND);
2402 _PyUnicode_CONVERT_BYTES(
2403 Py_UCS1, Py_UCS4,
2404 PyUnicode_1BYTE_DATA(s),
2405 PyUnicode_1BYTE_DATA(s) + len,
2406 result);
2407 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002408 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002409 default:
2410 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002411 }
Victor Stinner01698042011-10-04 00:04:26 +02002412 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002413 return NULL;
2414}
2415
2416static Py_UCS4*
2417as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2418 int copy_null)
2419{
2420 int kind;
2421 void *data;
2422 Py_ssize_t len, targetlen;
2423 if (PyUnicode_READY(string) == -1)
2424 return NULL;
2425 kind = PyUnicode_KIND(string);
2426 data = PyUnicode_DATA(string);
2427 len = PyUnicode_GET_LENGTH(string);
2428 targetlen = len;
2429 if (copy_null)
2430 targetlen++;
2431 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002432 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002433 if (!target) {
2434 PyErr_NoMemory();
2435 return NULL;
2436 }
2437 }
2438 else {
2439 if (targetsize < targetlen) {
2440 PyErr_Format(PyExc_SystemError,
2441 "string is longer than the buffer");
2442 if (copy_null && 0 < targetsize)
2443 target[0] = 0;
2444 return NULL;
2445 }
2446 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002447 if (kind == PyUnicode_1BYTE_KIND) {
2448 Py_UCS1 *start = (Py_UCS1 *) data;
2449 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002450 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002451 else if (kind == PyUnicode_2BYTE_KIND) {
2452 Py_UCS2 *start = (Py_UCS2 *) data;
2453 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2454 }
2455 else {
2456 assert(kind == PyUnicode_4BYTE_KIND);
Christian Heimesf051e432016-09-13 20:22:02 +02002457 memcpy(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002458 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002459 if (copy_null)
2460 target[len] = 0;
2461 return target;
2462}
2463
2464Py_UCS4*
2465PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2466 int copy_null)
2467{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002468 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002469 PyErr_BadInternalCall();
2470 return NULL;
2471 }
2472 return as_ucs4(string, target, targetsize, copy_null);
2473}
2474
2475Py_UCS4*
2476PyUnicode_AsUCS4Copy(PyObject *string)
2477{
2478 return as_ucs4(string, NULL, 0, 1);
2479}
2480
#ifdef HAVE_WCHAR_H

/* Create a str object from `size` wide characters at `w`.
   A `size` of -1 means "use wcslen(w)".  A NULL `w` yields the empty
   string when size == 0 and PyErr_BadInternalCall() + NULL otherwise.
   The conversion itself is delegated to PyUnicode_FromUnicode(). */
PyObject *
PyUnicode_FromWideChar(const wchar_t *w, Py_ssize_t size)
{
    if (w != NULL) {
        if (size == -1) {
            size = (Py_ssize_t)wcslen(w);
        }
        return PyUnicode_FromUnicode(w, size);
    }

    /* No buffer: only the empty string can be produced. */
    if (size == 0) {
        _Py_RETURN_UNICODE_EMPTY();
    }
    PyErr_BadInternalCall();
    return NULL;
}

#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002501
/* Size of the char buffers that receive sprintf() output for the integer
   conversions (up to %lld / %llu) and for %p.
   A long long needs at most ceil(log10(256) * SIZEOF_LONG_LONG) digits,
   plus 1 for the sign.  53/22 is used as an upper bound for log10(256), and
   (x*53 - 1) / 22 + 1 is the integer form of ceil(x*53 / 22).
   With SIZEOF_LONG_LONG == 8 this evaluates to 21. */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002506
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002507static int
2508unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2509 Py_ssize_t width, Py_ssize_t precision)
2510{
2511 Py_ssize_t length, fill, arglen;
2512 Py_UCS4 maxchar;
2513
2514 if (PyUnicode_READY(str) == -1)
2515 return -1;
2516
2517 length = PyUnicode_GET_LENGTH(str);
2518 if ((precision == -1 || precision >= length)
2519 && width <= length)
2520 return _PyUnicodeWriter_WriteStr(writer, str);
2521
2522 if (precision != -1)
2523 length = Py_MIN(precision, length);
2524
2525 arglen = Py_MAX(length, width);
2526 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2527 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2528 else
2529 maxchar = writer->maxchar;
2530
2531 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2532 return -1;
2533
2534 if (width > length) {
2535 fill = width - length;
2536 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2537 return -1;
2538 writer->pos += fill;
2539 }
2540
2541 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2542 str, 0, length);
2543 writer->pos += length;
2544 return 0;
2545}
2546
2547static int
2548unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2549 Py_ssize_t width, Py_ssize_t precision)
2550{
2551 /* UTF-8 */
2552 Py_ssize_t length;
2553 PyObject *unicode;
2554 int res;
2555
2556 length = strlen(str);
2557 if (precision != -1)
2558 length = Py_MIN(length, precision);
2559 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2560 if (unicode == NULL)
2561 return -1;
2562
2563 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2564 Py_DECREF(unicode);
2565 return res;
2566}
2567
Victor Stinner96865452011-03-01 23:44:09 +00002568static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002569unicode_fromformat_arg(_PyUnicodeWriter *writer,
2570 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002571{
Victor Stinnere215d962012-10-06 23:03:36 +02002572 const char *p;
2573 Py_ssize_t len;
2574 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002575 Py_ssize_t width;
2576 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002577 int longflag;
2578 int longlongflag;
2579 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002580 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002581
2582 p = f;
2583 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002584 zeropad = 0;
2585 if (*f == '0') {
2586 zeropad = 1;
2587 f++;
2588 }
Victor Stinner96865452011-03-01 23:44:09 +00002589
2590 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002591 width = -1;
2592 if (Py_ISDIGIT((unsigned)*f)) {
2593 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002594 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002595 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002596 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002597 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002598 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002599 return NULL;
2600 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002601 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002602 f++;
2603 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002604 }
2605 precision = -1;
2606 if (*f == '.') {
2607 f++;
2608 if (Py_ISDIGIT((unsigned)*f)) {
2609 precision = (*f - '0');
2610 f++;
2611 while (Py_ISDIGIT((unsigned)*f)) {
2612 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2613 PyErr_SetString(PyExc_ValueError,
2614 "precision too big");
2615 return NULL;
2616 }
2617 precision = (precision * 10) + (*f - '0');
2618 f++;
2619 }
2620 }
Victor Stinner96865452011-03-01 23:44:09 +00002621 if (*f == '%') {
2622 /* "%.3%s" => f points to "3" */
2623 f--;
2624 }
2625 }
2626 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002627 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002628 f--;
2629 }
Victor Stinner96865452011-03-01 23:44:09 +00002630
2631 /* Handle %ld, %lu, %lld and %llu. */
2632 longflag = 0;
2633 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002634 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002635 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002636 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002637 longflag = 1;
2638 ++f;
2639 }
Victor Stinner96865452011-03-01 23:44:09 +00002640 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002641 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002642 longlongflag = 1;
2643 f += 2;
2644 }
Victor Stinner96865452011-03-01 23:44:09 +00002645 }
2646 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002647 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002648 size_tflag = 1;
2649 ++f;
2650 }
Victor Stinnere215d962012-10-06 23:03:36 +02002651
2652 if (f[1] == '\0')
2653 writer->overallocate = 0;
2654
2655 switch (*f) {
2656 case 'c':
2657 {
2658 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002659 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002660 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002661 "character argument not in range(0x110000)");
2662 return NULL;
2663 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002664 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002665 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002666 break;
2667 }
2668
2669 case 'i':
2670 case 'd':
2671 case 'u':
2672 case 'x':
2673 {
2674 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002675 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002676 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002677
2678 if (*f == 'u') {
Victor Stinnere215d962012-10-06 23:03:36 +02002679 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002680 len = sprintf(buffer, "%lu",
Victor Stinnere215d962012-10-06 23:03:36 +02002681 va_arg(*vargs, unsigned long));
Victor Stinnere215d962012-10-06 23:03:36 +02002682 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002683 len = sprintf(buffer, "%llu",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002684 va_arg(*vargs, unsigned long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002685 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002686 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002687 va_arg(*vargs, size_t));
2688 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002689 len = sprintf(buffer, "%u",
Victor Stinnere215d962012-10-06 23:03:36 +02002690 va_arg(*vargs, unsigned int));
2691 }
2692 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002693 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002694 }
2695 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002696 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002697 len = sprintf(buffer, "%li",
Victor Stinnere215d962012-10-06 23:03:36 +02002698 va_arg(*vargs, long));
Victor Stinnere215d962012-10-06 23:03:36 +02002699 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002700 len = sprintf(buffer, "%lli",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002701 va_arg(*vargs, long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002702 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002703 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002704 va_arg(*vargs, Py_ssize_t));
2705 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002706 len = sprintf(buffer, "%i",
Victor Stinnere215d962012-10-06 23:03:36 +02002707 va_arg(*vargs, int));
2708 }
2709 assert(len >= 0);
2710
Victor Stinnere215d962012-10-06 23:03:36 +02002711 if (precision < len)
2712 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002713
2714 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002715 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2716 return NULL;
2717
Victor Stinnere215d962012-10-06 23:03:36 +02002718 if (width > precision) {
2719 Py_UCS4 fillchar;
2720 fill = width - precision;
2721 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002722 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2723 return NULL;
2724 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002725 }
Victor Stinner15a11362012-10-06 23:48:20 +02002726 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002727 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002728 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2729 return NULL;
2730 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002731 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002732
Victor Stinner4a587072013-11-19 12:54:53 +01002733 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2734 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002735 break;
2736 }
2737
2738 case 'p':
2739 {
2740 char number[MAX_LONG_LONG_CHARS];
2741
2742 len = sprintf(number, "%p", va_arg(*vargs, void*));
2743 assert(len >= 0);
2744
2745 /* %p is ill-defined: ensure leading 0x. */
2746 if (number[1] == 'X')
2747 number[1] = 'x';
2748 else if (number[1] != 'x') {
2749 memmove(number + 2, number,
2750 strlen(number) + 1);
2751 number[0] = '0';
2752 number[1] = 'x';
2753 len += 2;
2754 }
2755
Victor Stinner4a587072013-11-19 12:54:53 +01002756 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002757 return NULL;
2758 break;
2759 }
2760
2761 case 's':
2762 {
2763 /* UTF-8 */
2764 const char *s = va_arg(*vargs, const char*);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002765 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002766 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002767 break;
2768 }
2769
2770 case 'U':
2771 {
2772 PyObject *obj = va_arg(*vargs, PyObject *);
2773 assert(obj && _PyUnicode_CHECK(obj));
2774
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002775 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002776 return NULL;
2777 break;
2778 }
2779
2780 case 'V':
2781 {
2782 PyObject *obj = va_arg(*vargs, PyObject *);
2783 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002784 if (obj) {
2785 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002786 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002787 return NULL;
2788 }
2789 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002790 assert(str != NULL);
2791 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002792 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002793 }
2794 break;
2795 }
2796
2797 case 'S':
2798 {
2799 PyObject *obj = va_arg(*vargs, PyObject *);
2800 PyObject *str;
2801 assert(obj);
2802 str = PyObject_Str(obj);
2803 if (!str)
2804 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002805 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002806 Py_DECREF(str);
2807 return NULL;
2808 }
2809 Py_DECREF(str);
2810 break;
2811 }
2812
2813 case 'R':
2814 {
2815 PyObject *obj = va_arg(*vargs, PyObject *);
2816 PyObject *repr;
2817 assert(obj);
2818 repr = PyObject_Repr(obj);
2819 if (!repr)
2820 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002821 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002822 Py_DECREF(repr);
2823 return NULL;
2824 }
2825 Py_DECREF(repr);
2826 break;
2827 }
2828
2829 case 'A':
2830 {
2831 PyObject *obj = va_arg(*vargs, PyObject *);
2832 PyObject *ascii;
2833 assert(obj);
2834 ascii = PyObject_ASCII(obj);
2835 if (!ascii)
2836 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002837 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002838 Py_DECREF(ascii);
2839 return NULL;
2840 }
2841 Py_DECREF(ascii);
2842 break;
2843 }
2844
2845 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002846 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002847 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002848 break;
2849
2850 default:
2851 /* if we stumble upon an unknown formatting code, copy the rest
2852 of the format string to the output string. (we cannot just
2853 skip the code, since there's no way to know what's in the
2854 argument list) */
2855 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01002856 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002857 return NULL;
2858 f = p+len;
2859 return f;
2860 }
2861
2862 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002863 return f;
2864}
2865
Walter Dörwaldd2034312007-05-18 16:29:38 +00002866PyObject *
2867PyUnicode_FromFormatV(const char *format, va_list vargs)
2868{
Victor Stinnere215d962012-10-06 23:03:36 +02002869 va_list vargs2;
2870 const char *f;
2871 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002872
Victor Stinner8f674cc2013-04-17 23:02:17 +02002873 _PyUnicodeWriter_Init(&writer);
2874 writer.min_length = strlen(format) + 100;
2875 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02002876
Benjamin Peterson0c212142016-09-20 20:39:33 -07002877 // Copy varags to be able to pass a reference to a subfunction.
2878 va_copy(vargs2, vargs);
Victor Stinnere215d962012-10-06 23:03:36 +02002879
2880 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002881 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002882 f = unicode_fromformat_arg(&writer, f, &vargs2);
2883 if (f == NULL)
2884 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002885 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002886 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002887 const char *p;
2888 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002889
Victor Stinnere215d962012-10-06 23:03:36 +02002890 p = f;
2891 do
2892 {
2893 if ((unsigned char)*p > 127) {
Christian Heimes2f2fee12016-09-21 11:37:27 +02002894 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02002895 PyErr_Format(PyExc_ValueError,
2896 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2897 "string, got a non-ASCII byte: 0x%02x",
2898 (unsigned char)*p);
2899 return NULL;
2900 }
2901 p++;
2902 }
2903 while (*p != '\0' && *p != '%');
2904 len = p - f;
2905
2906 if (*p == '\0')
2907 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01002908
2909 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002910 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002911
2912 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002913 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002914 }
Christian Heimes2f2fee12016-09-21 11:37:27 +02002915 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02002916 return _PyUnicodeWriter_Finish(&writer);
2917
2918 fail:
Christian Heimes2f2fee12016-09-21 11:37:27 +02002919 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02002920 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002921 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002922}
2923
Walter Dörwaldd2034312007-05-18 16:29:38 +00002924PyObject *
2925PyUnicode_FromFormat(const char *format, ...)
2926{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002927 PyObject* ret;
2928 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002929
2930#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002931 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002932#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002933 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002934#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002935 ret = PyUnicode_FromFormatV(format, vargs);
2936 va_end(vargs);
2937 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002938}
2939
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002940#ifdef HAVE_WCHAR_H
2941
Victor Stinner5593d8a2010-10-02 11:11:27 +00002942/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2943 convert a Unicode object to a wide character string.
2944
Victor Stinnerd88d9832011-09-06 02:00:05 +02002945 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002946 character) required to convert the unicode object. Ignore size argument.
2947
Victor Stinnerd88d9832011-09-06 02:00:05 +02002948 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002949 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002950 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002951static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002952unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002953 wchar_t *w,
2954 Py_ssize_t size)
2955{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002956 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002957 const wchar_t *wstr;
2958
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002959 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002960 if (wstr == NULL)
2961 return -1;
2962
Victor Stinner5593d8a2010-10-02 11:11:27 +00002963 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002964 if (size > res)
2965 size = res + 1;
2966 else
2967 res = size;
Christian Heimesf051e432016-09-13 20:22:02 +02002968 memcpy(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002969 return res;
2970 }
2971 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002972 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002973}
2974
2975Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002976PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002977 wchar_t *w,
2978 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002979{
2980 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002981 PyErr_BadInternalCall();
2982 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002983 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002984 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002985}
2986
Victor Stinner137c34c2010-09-29 10:25:54 +00002987wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002988PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002989 Py_ssize_t *size)
2990{
2991 wchar_t* buffer;
2992 Py_ssize_t buflen;
2993
2994 if (unicode == NULL) {
2995 PyErr_BadInternalCall();
2996 return NULL;
2997 }
2998
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002999 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003000 if (buflen == -1)
3001 return NULL;
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02003002 buffer = PyMem_NEW(wchar_t, buflen);
Victor Stinner137c34c2010-09-29 10:25:54 +00003003 if (buffer == NULL) {
3004 PyErr_NoMemory();
3005 return NULL;
3006 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003007 buflen = unicode_aswidechar(unicode, buffer, buflen);
Stefan Krah8528c312012-08-19 21:52:43 +02003008 if (buflen == -1) {
3009 PyMem_FREE(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003010 return NULL;
Stefan Krah8528c312012-08-19 21:52:43 +02003011 }
Victor Stinner5593d8a2010-10-02 11:11:27 +00003012 if (size != NULL)
3013 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00003014 return buffer;
3015}
3016
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003017#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003018
Alexander Belopolsky40018472011-02-26 01:02:56 +00003019PyObject *
3020PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003021{
Victor Stinner8faf8212011-12-08 22:14:11 +01003022 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003023 PyErr_SetString(PyExc_ValueError,
3024 "chr() arg not in range(0x110000)");
3025 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003026 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00003027
Victor Stinner985a82a2014-01-03 12:53:47 +01003028 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003029}
3030
Alexander Belopolsky40018472011-02-26 01:02:56 +00003031PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003032PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003033{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003034 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00003035 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003036 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003037 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02003038 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00003039 Py_INCREF(obj);
3040 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003041 }
3042 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003043 /* For a Unicode subtype that's not a Unicode object,
3044 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003045 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003046 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003047 PyErr_Format(PyExc_TypeError,
3048 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00003049 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003050 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003051}
3052
Alexander Belopolsky40018472011-02-26 01:02:56 +00003053PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003054PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003055 const char *encoding,
3056 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003057{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003058 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003059 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003060
Guido van Rossumd57fd912000-03-10 22:53:23 +00003061 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003062 PyErr_BadInternalCall();
3063 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003064 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003065
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003066 /* Decoding bytes objects is the most common case and should be fast */
3067 if (PyBytes_Check(obj)) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003068 if (PyBytes_GET_SIZE(obj) == 0)
3069 _Py_RETURN_UNICODE_EMPTY();
3070 v = PyUnicode_Decode(
3071 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3072 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003073 return v;
3074 }
3075
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003076 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003077 PyErr_SetString(PyExc_TypeError,
3078 "decoding str is not supported");
3079 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003080 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003081
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003082 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3083 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3084 PyErr_Format(PyExc_TypeError,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003085 "decoding to str: need a bytes-like object, %.80s found",
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003086 Py_TYPE(obj)->tp_name);
3087 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003088 }
Tim Petersced69f82003-09-16 20:30:58 +00003089
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003090 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003091 PyBuffer_Release(&buffer);
3092 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003093 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003094
Serhiy Storchaka05997252013-01-26 12:14:02 +02003095 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003096 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003097 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003098}
3099
/* C counterpart of encodings.normalize_encoding().

   Copies encoding into lower, lower-casing every alphanumeric character and
   keeping '.' as is.  Any run of other characters that sits between two kept
   characters is replaced by a single '_'; such runs at the very start or end
   of the name are dropped.  The result is null-terminated.

   Returns 1 on success, or 0 if the normalized name does not fit into
   lower_len-1 characters. */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *src;
    char *dst = lower;
    /* Last slot of the output, reserved for the terminating null. */
    char *const last = &lower[lower_len - 1];
    int pending_sep = 0;

    assert(encoding != NULL);

    for (src = encoding; *src != 0; src++) {
        char c = *src;

        if (!(Py_ISALNUM(c) || c == '.')) {
            /* Remember the separator; it is emitted lazily, and only if
               another kept character follows. */
            pending_sep = 1;
            continue;
        }

        if (pending_sep && dst != lower) {
            if (dst == last) {
                return 0;
            }
            *dst++ = '_';
        }
        pending_sep = 0;

        if (dst == last) {
            return 0;
        }
        *dst++ = Py_TOLOWER(c);
    }

    *dst = '\0';
    return 1;
}
3148
Alexander Belopolsky40018472011-02-26 01:02:56 +00003149PyObject *
3150PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003151 Py_ssize_t size,
3152 const char *encoding,
3153 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003154{
3155 PyObject *buffer = NULL, *unicode;
3156 Py_buffer info;
Victor Stinner942889a2016-09-05 15:40:10 -07003157 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3158
3159 if (encoding == NULL) {
3160 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3161 }
Victor Stinner600d3be2010-06-10 12:00:55 +00003162
Fred Drakee4315f52000-05-09 19:53:39 +00003163 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003164 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3165 char *lower = buflower;
3166
3167 /* Fast paths */
3168 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3169 lower += 3;
3170 if (*lower == '_') {
3171 /* Match "utf8" and "utf_8" */
3172 lower++;
3173 }
3174
3175 if (lower[0] == '8' && lower[1] == 0) {
3176 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3177 }
3178 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3179 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3180 }
3181 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3182 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3183 }
3184 }
3185 else {
3186 if (strcmp(lower, "ascii") == 0
3187 || strcmp(lower, "us_ascii") == 0) {
3188 return PyUnicode_DecodeASCII(s, size, errors);
3189 }
Steve Dowercc16be82016-09-08 10:35:16 -07003190 #ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003191 else if (strcmp(lower, "mbcs") == 0) {
3192 return PyUnicode_DecodeMBCS(s, size, errors);
3193 }
3194 #endif
3195 else if (strcmp(lower, "latin1") == 0
3196 || strcmp(lower, "latin_1") == 0
3197 || strcmp(lower, "iso_8859_1") == 0
3198 || strcmp(lower, "iso8859_1") == 0) {
3199 return PyUnicode_DecodeLatin1(s, size, errors);
3200 }
3201 }
Victor Stinner37296e82010-06-10 13:36:23 +00003202 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003203
3204 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003205 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003206 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003207 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003208 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003209 if (buffer == NULL)
3210 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003211 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003212 if (unicode == NULL)
3213 goto onError;
3214 if (!PyUnicode_Check(unicode)) {
3215 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003216 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3217 "use codecs.decode() to decode to arbitrary types",
3218 encoding,
3219 Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003220 Py_DECREF(unicode);
3221 goto onError;
3222 }
3223 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003224 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003225
Benjamin Peterson29060642009-01-31 22:14:21 +00003226 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003227 Py_XDECREF(buffer);
3228 return NULL;
3229}
3230
Alexander Belopolsky40018472011-02-26 01:02:56 +00003231PyObject *
3232PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003233 const char *encoding,
3234 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003235{
3236 PyObject *v;
3237
3238 if (!PyUnicode_Check(unicode)) {
3239 PyErr_BadArgument();
3240 goto onError;
3241 }
3242
3243 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003244 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003245
3246 /* Decode via the codec registry */
3247 v = PyCodec_Decode(unicode, encoding, errors);
3248 if (v == NULL)
3249 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003250 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003251
Benjamin Peterson29060642009-01-31 22:14:21 +00003252 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003253 return NULL;
3254}
3255
Alexander Belopolsky40018472011-02-26 01:02:56 +00003256PyObject *
3257PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003258 const char *encoding,
3259 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003260{
3261 PyObject *v;
3262
3263 if (!PyUnicode_Check(unicode)) {
3264 PyErr_BadArgument();
3265 goto onError;
3266 }
3267
3268 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003269 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003270
3271 /* Decode via the codec registry */
3272 v = PyCodec_Decode(unicode, encoding, errors);
3273 if (v == NULL)
3274 goto onError;
3275 if (!PyUnicode_Check(v)) {
3276 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003277 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3278 "use codecs.decode() to decode to arbitrary types",
3279 encoding,
3280 Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003281 Py_DECREF(v);
3282 goto onError;
3283 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003284 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003285
Benjamin Peterson29060642009-01-31 22:14:21 +00003286 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003287 return NULL;
3288}
3289
Alexander Belopolsky40018472011-02-26 01:02:56 +00003290PyObject *
3291PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003292 Py_ssize_t size,
3293 const char *encoding,
3294 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003295{
3296 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003297
Guido van Rossumd57fd912000-03-10 22:53:23 +00003298 unicode = PyUnicode_FromUnicode(s, size);
3299 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003300 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003301 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3302 Py_DECREF(unicode);
3303 return v;
3304}
3305
Alexander Belopolsky40018472011-02-26 01:02:56 +00003306PyObject *
3307PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003308 const char *encoding,
3309 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003310{
3311 PyObject *v;
3312
3313 if (!PyUnicode_Check(unicode)) {
3314 PyErr_BadArgument();
3315 goto onError;
3316 }
3317
3318 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003319 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003320
3321 /* Encode via the codec registry */
3322 v = PyCodec_Encode(unicode, encoding, errors);
3323 if (v == NULL)
3324 goto onError;
3325 return v;
3326
Benjamin Peterson29060642009-01-31 22:14:21 +00003327 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003328 return NULL;
3329}
3330
/* Locate the first wide character of wstr that wcstombs() refuses to
   convert, by converting the string one character at a time (one surrogate
   pair at a time when wchar_t is 16 bits wide).

   Returns the index, in wchar_t units, of the offending character.  Returns
   0 as well when every character converts successfully. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
    const wchar_t *cursor = wstr;
    char encoded[MB_LEN_MAX];
#if SIZEOF_WCHAR_T == 2
    /* Room for a surrogate pair followed by the terminator. */
    wchar_t unit[3] = {0, 0, 0};
#else
    /* One character followed by the terminator. */
    wchar_t unit[2] = {0, 0};
#endif

    while (*cursor != L'\0') {
        const wchar_t *unit_start = cursor;

#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cursor[1]))
        {
            unit[0] = cursor[0];
            unit[1] = cursor[1];
            cursor += 2;
        }
        else {
            unit[0] = cursor[0];
            unit[1] = 0;
            cursor += 1;
        }
#else
        unit[0] = *cursor;
        cursor++;
#endif

        if (wcstombs(encoded, unit, sizeof(encoded)) == (size_t)-1) {
            return unit_start - wstr;
        }
    }

    /* failed to find the unencodable character */
    return 0;
}
3377
Victor Stinner1b579672011-12-17 05:47:23 +01003378static int
3379locale_error_handler(const char *errors, int *surrogateescape)
3380{
Victor Stinner50149202015-09-22 00:26:54 +02003381 _Py_error_handler error_handler = get_error_handler(errors);
3382 switch (error_handler)
3383 {
3384 case _Py_ERROR_STRICT:
Victor Stinner1b579672011-12-17 05:47:23 +01003385 *surrogateescape = 0;
3386 return 0;
Victor Stinner50149202015-09-22 00:26:54 +02003387 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner1b579672011-12-17 05:47:23 +01003388 *surrogateescape = 1;
3389 return 0;
Victor Stinner50149202015-09-22 00:26:54 +02003390 default:
3391 PyErr_Format(PyExc_ValueError,
3392 "only 'strict' and 'surrogateescape' error handlers "
3393 "are supported, not '%s'",
3394 errors);
3395 return -1;
Victor Stinner1b579672011-12-17 05:47:23 +01003396 }
Victor Stinner1b579672011-12-17 05:47:23 +01003397}
3398
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003399PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003400PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003401{
3402 Py_ssize_t wlen, wlen2;
3403 wchar_t *wstr;
3404 PyObject *bytes = NULL;
3405 char *errmsg;
Raymond Hettingere56666d2013-08-04 11:51:03 -07003406 PyObject *reason = NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003407 PyObject *exc;
3408 size_t error_pos;
Victor Stinner1b579672011-12-17 05:47:23 +01003409 int surrogateescape;
3410
3411 if (locale_error_handler(errors, &surrogateescape) < 0)
3412 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003413
3414 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3415 if (wstr == NULL)
3416 return NULL;
3417
3418 wlen2 = wcslen(wstr);
3419 if (wlen2 != wlen) {
3420 PyMem_Free(wstr);
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003421 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003422 return NULL;
3423 }
3424
3425 if (surrogateescape) {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003426 /* "surrogateescape" error handler */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003427 char *str;
3428
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003429 str = Py_EncodeLocale(wstr, &error_pos);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003430 if (str == NULL) {
3431 if (error_pos == (size_t)-1) {
3432 PyErr_NoMemory();
3433 PyMem_Free(wstr);
3434 return NULL;
3435 }
3436 else {
3437 goto encode_error;
3438 }
3439 }
3440 PyMem_Free(wstr);
3441
3442 bytes = PyBytes_FromString(str);
3443 PyMem_Free(str);
3444 }
3445 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003446 /* strict mode */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003447 size_t len, len2;
3448
3449 len = wcstombs(NULL, wstr, 0);
3450 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003451 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003452 goto encode_error;
3453 }
3454
3455 bytes = PyBytes_FromStringAndSize(NULL, len);
3456 if (bytes == NULL) {
3457 PyMem_Free(wstr);
3458 return NULL;
3459 }
3460
3461 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3462 if (len2 == (size_t)-1 || len2 > len) {
Victor Stinner2f197072011-12-17 07:08:30 +01003463 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003464 goto encode_error;
3465 }
3466 PyMem_Free(wstr);
3467 }
3468 return bytes;
3469
3470encode_error:
3471 errmsg = strerror(errno);
3472 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003473
3474 if (error_pos == (size_t)-1)
3475 error_pos = wcstombs_errorpos(wstr);
3476
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003477 PyMem_Free(wstr);
3478 Py_XDECREF(bytes);
3479
Victor Stinner2f197072011-12-17 07:08:30 +01003480 if (errmsg != NULL) {
3481 size_t errlen;
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003482 wstr = Py_DecodeLocale(errmsg, &errlen);
Victor Stinner2f197072011-12-17 07:08:30 +01003483 if (wstr != NULL) {
3484 reason = PyUnicode_FromWideChar(wstr, errlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003485 PyMem_RawFree(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003486 } else
3487 errmsg = NULL;
3488 }
3489 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003490 reason = PyUnicode_FromString(
3491 "wcstombs() encountered an unencodable "
3492 "wide character");
3493 if (reason == NULL)
3494 return NULL;
3495
3496 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3497 "locale", unicode,
3498 (Py_ssize_t)error_pos,
3499 (Py_ssize_t)(error_pos+1),
3500 reason);
3501 Py_DECREF(reason);
3502 if (exc != NULL) {
3503 PyCodec_StrictErrors(exc);
3504 Py_XDECREF(exc);
3505 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003506 return NULL;
3507}
3508
Victor Stinnerad158722010-10-27 00:25:46 +00003509PyObject *
3510PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003511{
Steve Dowercc16be82016-09-08 10:35:16 -07003512#if defined(__APPLE__)
3513 return _PyUnicode_AsUTF8String(unicode, Py_FileSystemDefaultEncodeErrors);
Victor Stinnerad158722010-10-27 00:25:46 +00003514#else
Victor Stinner793b5312011-04-27 00:24:21 +02003515 PyInterpreterState *interp = PyThreadState_GET()->interp;
3516 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3517 cannot use it to encode and decode filenames before it is loaded. Load
3518 the Python codec requires to encode at least its own filename. Use the C
3519 version of the locale codec until the codec registry is initialized and
3520 the Python codec is loaded.
3521
3522 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3523 cannot only rely on it: check also interp->fscodec_initialized for
3524 subinterpreters. */
3525 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003526 return PyUnicode_AsEncodedString(unicode,
3527 Py_FileSystemDefaultEncoding,
Steve Dowercc16be82016-09-08 10:35:16 -07003528 Py_FileSystemDefaultEncodeErrors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003529 }
3530 else {
Steve Dowercc16be82016-09-08 10:35:16 -07003531 return PyUnicode_EncodeLocale(unicode, Py_FileSystemDefaultEncodeErrors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003532 }
Victor Stinnerad158722010-10-27 00:25:46 +00003533#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003534}
3535
Alexander Belopolsky40018472011-02-26 01:02:56 +00003536PyObject *
3537PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003538 const char *encoding,
3539 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003540{
3541 PyObject *v;
Victor Stinner942889a2016-09-05 15:40:10 -07003542 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003543
Guido van Rossumd57fd912000-03-10 22:53:23 +00003544 if (!PyUnicode_Check(unicode)) {
3545 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003546 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003547 }
Fred Drakee4315f52000-05-09 19:53:39 +00003548
Victor Stinner942889a2016-09-05 15:40:10 -07003549 if (encoding == NULL) {
3550 return _PyUnicode_AsUTF8String(unicode, errors);
3551 }
3552
Fred Drakee4315f52000-05-09 19:53:39 +00003553 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003554 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3555 char *lower = buflower;
3556
3557 /* Fast paths */
3558 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3559 lower += 3;
3560 if (*lower == '_') {
3561 /* Match "utf8" and "utf_8" */
3562 lower++;
3563 }
3564
3565 if (lower[0] == '8' && lower[1] == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003566 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner942889a2016-09-05 15:40:10 -07003567 }
3568 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3569 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3570 }
3571 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3572 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3573 }
Victor Stinnera5c68c32011-03-02 01:03:14 +00003574 }
Victor Stinner942889a2016-09-05 15:40:10 -07003575 else {
3576 if (strcmp(lower, "ascii") == 0
3577 || strcmp(lower, "us_ascii") == 0) {
3578 return _PyUnicode_AsASCIIString(unicode, errors);
3579 }
Steve Dowercc16be82016-09-08 10:35:16 -07003580#ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003581 else if (strcmp(lower, "mbcs") == 0) {
3582 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3583 }
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003584#endif
Victor Stinner942889a2016-09-05 15:40:10 -07003585 else if (strcmp(lower, "latin1") == 0 ||
3586 strcmp(lower, "latin_1") == 0 ||
3587 strcmp(lower, "iso_8859_1") == 0 ||
3588 strcmp(lower, "iso8859_1") == 0) {
3589 return _PyUnicode_AsLatin1String(unicode, errors);
3590 }
3591 }
Victor Stinner37296e82010-06-10 13:36:23 +00003592 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003593
3594 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003595 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003596 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003597 return NULL;
3598
3599 /* The normal path */
3600 if (PyBytes_Check(v))
3601 return v;
3602
3603 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003604 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003605 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003606 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003607
3608 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003609 "encoder %s returned bytearray instead of bytes; "
3610 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003611 encoding);
3612 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003613 Py_DECREF(v);
3614 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003615 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003616
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003617 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3618 Py_DECREF(v);
3619 return b;
3620 }
3621
3622 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003623 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
3624 "use codecs.encode() to encode to arbitrary types",
3625 encoding,
3626 Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003627 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003628 return NULL;
3629}
3630
Alexander Belopolsky40018472011-02-26 01:02:56 +00003631PyObject *
3632PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003633 const char *encoding,
3634 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003635{
3636 PyObject *v;
3637
3638 if (!PyUnicode_Check(unicode)) {
3639 PyErr_BadArgument();
3640 goto onError;
3641 }
3642
3643 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003644 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003645
3646 /* Encode via the codec registry */
3647 v = PyCodec_Encode(unicode, encoding, errors);
3648 if (v == NULL)
3649 goto onError;
3650 if (!PyUnicode_Check(v)) {
3651 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003652 "'%.400s' encoder returned '%.400s' instead of 'str'; "
3653 "use codecs.encode() to encode to arbitrary types",
3654 encoding,
3655 Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003656 Py_DECREF(v);
3657 goto onError;
3658 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003659 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003660
Benjamin Peterson29060642009-01-31 22:14:21 +00003661 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003662 return NULL;
3663}
3664
/* Locate the first byte sequence of str (len bytes) that mbrtowc() cannot
   convert.

   Returns the byte offset of that sequence.  Returns 0 when no such
   sequence is found before the end of the input or a null character, and
   also when mbrtowc() is not available (HAVE_MBRTOWC undefined). */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *cursor = str;
    size_t remaining = len;
    mbstate_t state;
    wchar_t wc;

    memset(&state, 0, sizeof state);
    while (remaining != 0) {
        size_t consumed = mbrtowc(&wc, cursor, remaining, &state);

        if (consumed == 0) {
            /* Reached end of string */
            break;
        }
        if (consumed == (size_t)-1 || consumed == (size_t)-2) {
            /* Conversion error or incomplete character */
            return cursor - str;
        }
        cursor += consumed;
        remaining -= consumed;
    }
    /* failed to find the undecodable byte sequence */
#endif
    return 0;
}
3695
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003696PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003697PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003698 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003699{
3700 wchar_t smallbuf[256];
3701 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3702 wchar_t *wstr;
3703 size_t wlen, wlen2;
3704 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003705 int surrogateescape;
Victor Stinner2f197072011-12-17 07:08:30 +01003706 size_t error_pos;
3707 char *errmsg;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01003708 PyObject *reason = NULL; /* initialize to prevent gcc warning */
3709 PyObject *exc;
Victor Stinner1b579672011-12-17 05:47:23 +01003710
3711 if (locale_error_handler(errors, &surrogateescape) < 0)
3712 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003713
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003714 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3715 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003716 return NULL;
3717 }
3718
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003719 if (surrogateescape) {
3720 /* "surrogateescape" error handler */
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003721 wstr = Py_DecodeLocale(str, &wlen);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003722 if (wstr == NULL) {
3723 if (wlen == (size_t)-1)
3724 PyErr_NoMemory();
3725 else
3726 PyErr_SetFromErrno(PyExc_OSError);
3727 return NULL;
3728 }
3729
3730 unicode = PyUnicode_FromWideChar(wstr, wlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003731 PyMem_RawFree(wstr);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003732 }
3733 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003734 /* strict mode */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003735#ifndef HAVE_BROKEN_MBSTOWCS
3736 wlen = mbstowcs(NULL, str, 0);
3737#else
3738 wlen = len;
3739#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003740 if (wlen == (size_t)-1)
3741 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003742 if (wlen+1 <= smallbuf_len) {
3743 wstr = smallbuf;
3744 }
3745 else {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02003746 wstr = PyMem_New(wchar_t, wlen+1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003747 if (!wstr)
3748 return PyErr_NoMemory();
3749 }
3750
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003751 wlen2 = mbstowcs(wstr, str, wlen+1);
3752 if (wlen2 == (size_t)-1) {
3753 if (wstr != smallbuf)
3754 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003755 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003756 }
3757#ifdef HAVE_BROKEN_MBSTOWCS
3758 assert(wlen2 == wlen);
3759#endif
3760 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3761 if (wstr != smallbuf)
3762 PyMem_Free(wstr);
3763 }
3764 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003765
3766decode_error:
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +02003767 reason = NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01003768 errmsg = strerror(errno);
3769 assert(errmsg != NULL);
3770
3771 error_pos = mbstowcs_errorpos(str, len);
3772 if (errmsg != NULL) {
3773 size_t errlen;
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003774 wstr = Py_DecodeLocale(errmsg, &errlen);
Victor Stinner2f197072011-12-17 07:08:30 +01003775 if (wstr != NULL) {
3776 reason = PyUnicode_FromWideChar(wstr, errlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003777 PyMem_RawFree(wstr);
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +02003778 }
Victor Stinner2f197072011-12-17 07:08:30 +01003779 }
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +02003780 if (reason == NULL)
Victor Stinner2f197072011-12-17 07:08:30 +01003781 reason = PyUnicode_FromString(
3782 "mbstowcs() encountered an invalid multibyte sequence");
3783 if (reason == NULL)
3784 return NULL;
3785
3786 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3787 "locale", str, len,
3788 (Py_ssize_t)error_pos,
3789 (Py_ssize_t)(error_pos+1),
3790 reason);
3791 Py_DECREF(reason);
3792 if (exc != NULL) {
3793 PyCodec_StrictErrors(exc);
3794 Py_XDECREF(exc);
3795 }
3796 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003797}
3798
3799PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003800PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003801{
3802 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003803 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003804}
3805
3806
3807PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003808PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003809 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003810 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3811}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003812
Christian Heimes5894ba72007-11-04 11:43:14 +00003813PyObject*
3814PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3815{
Steve Dowercc16be82016-09-08 10:35:16 -07003816#if defined(__APPLE__)
3817 return PyUnicode_DecodeUTF8Stateful(s, size, Py_FileSystemDefaultEncodeErrors, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003818#else
Victor Stinner793b5312011-04-27 00:24:21 +02003819 PyInterpreterState *interp = PyThreadState_GET()->interp;
3820 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3821 cannot use it to encode and decode filenames before it is loaded. Load
3822 the Python codec requires to encode at least its own filename. Use the C
3823 version of the locale codec until the codec registry is initialized and
3824 the Python codec is loaded.
3825
3826 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3827 cannot only rely on it: check also interp->fscodec_initialized for
3828 subinterpreters. */
3829 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Steve Dowercc16be82016-09-08 10:35:16 -07003830 PyObject *res = PyUnicode_Decode(s, size,
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003831 Py_FileSystemDefaultEncoding,
Steve Dowercc16be82016-09-08 10:35:16 -07003832 Py_FileSystemDefaultEncodeErrors);
3833#ifdef MS_WINDOWS
3834 if (!res && PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
3835 PyObject *exc, *val, *tb;
3836 PyErr_Fetch(&exc, &val, &tb);
3837 PyErr_Format(PyExc_RuntimeError,
3838 "filesystem path bytes were not correctly encoded with '%s'. " \
3839 "Please report this at http://bugs.python.org/issue27781",
3840 Py_FileSystemDefaultEncoding);
3841 _PyErr_ChainExceptions(exc, val, tb);
3842 }
3843#endif
3844 return res;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003845 }
3846 else {
Steve Dowercc16be82016-09-08 10:35:16 -07003847 return PyUnicode_DecodeLocaleAndSize(s, size, Py_FileSystemDefaultEncodeErrors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003848 }
Victor Stinnerad158722010-10-27 00:25:46 +00003849#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003850}
3851
Martin v. Löwis011e8422009-05-05 04:43:17 +00003852
3853int
3854PyUnicode_FSConverter(PyObject* arg, void* addr)
3855{
Brett Cannonec6ce872016-09-06 15:50:29 -07003856 PyObject *path = NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003857 PyObject *output = NULL;
3858 Py_ssize_t size;
3859 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003860 if (arg == NULL) {
3861 Py_DECREF(*(PyObject**)addr);
Benjamin Petersona4d33b32015-11-15 21:57:39 -08003862 *(PyObject**)addr = NULL;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003863 return 1;
3864 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003865 path = PyOS_FSPath(arg);
3866 if (path == NULL) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003867 return 0;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003868 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003869 if (PyBytes_Check(path)) {
3870 output = path;
3871 }
3872 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
3873 output = PyUnicode_EncodeFSDefault(path);
3874 Py_DECREF(path);
3875 if (!output) {
3876 return 0;
3877 }
3878 assert(PyBytes_Check(output));
3879 }
3880
Victor Stinner0ea2a462010-04-30 00:22:08 +00003881 size = PyBytes_GET_SIZE(output);
3882 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02003883 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003884 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003885 Py_DECREF(output);
3886 return 0;
3887 }
3888 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003889 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003890}
3891
3892
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003893int
3894PyUnicode_FSDecoder(PyObject* arg, void* addr)
3895{
Brett Cannona5711202016-09-06 19:36:01 -07003896 int is_buffer = 0;
3897 PyObject *path = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003898 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003899 if (arg == NULL) {
3900 Py_DECREF(*(PyObject**)addr);
3901 return 1;
3902 }
Brett Cannona5711202016-09-06 19:36:01 -07003903
3904 is_buffer = PyObject_CheckBuffer(arg);
3905 if (!is_buffer) {
3906 path = PyOS_FSPath(arg);
3907 if (path == NULL) {
Serhiy Storchakafebc3322016-08-06 23:29:29 +03003908 return 0;
3909 }
Brett Cannona5711202016-09-06 19:36:01 -07003910 }
3911 else {
3912 path = arg;
3913 Py_INCREF(arg);
3914 }
3915
3916 if (PyUnicode_Check(path)) {
3917 if (PyUnicode_READY(path) == -1) {
3918 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003919 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07003920 }
3921 output = path;
3922 }
3923 else if (PyBytes_Check(path) || is_buffer) {
3924 PyObject *path_bytes = NULL;
3925
3926 if (!PyBytes_Check(path) &&
3927 PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
3928 "path should be string, bytes, or os.PathLike, not %.200s",
3929 Py_TYPE(arg)->tp_name)) {
3930 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003931 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07003932 }
3933 path_bytes = PyBytes_FromObject(path);
3934 Py_DECREF(path);
3935 if (!path_bytes) {
3936 return 0;
3937 }
3938 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
3939 PyBytes_GET_SIZE(path_bytes));
3940 Py_DECREF(path_bytes);
3941 if (!output) {
3942 return 0;
3943 }
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003944 }
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003945 else {
3946 PyErr_Format(PyExc_TypeError,
Brett Cannona5711202016-09-06 19:36:01 -07003947 "path should be string, bytes, or os.PathLike, not %.200s",
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003948 Py_TYPE(arg)->tp_name);
Brett Cannona5711202016-09-06 19:36:01 -07003949 Py_DECREF(path);
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003950 return 0;
3951 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003952 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003953 Py_DECREF(output);
3954 return 0;
3955 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003956 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003957 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003958 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003959 Py_DECREF(output);
3960 return 0;
3961 }
3962 *(PyObject**)addr = output;
3963 return Py_CLEANUP_SUPPORTED;
3964}
3965
3966
Martin v. Löwis5b222132007-06-10 09:51:05 +00003967char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003968PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003969{
Christian Heimesf3863112007-11-22 07:46:41 +00003970 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003971
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003972 if (!PyUnicode_Check(unicode)) {
3973 PyErr_BadArgument();
3974 return NULL;
3975 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003976 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003977 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003978
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003979 if (PyUnicode_UTF8(unicode) == NULL) {
3980 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003981 bytes = _PyUnicode_AsUTF8String(unicode, NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003982 if (bytes == NULL)
3983 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003984 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3985 if (_PyUnicode_UTF8(unicode) == NULL) {
Victor Stinnera5afb582013-10-29 01:28:23 +01003986 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003987 Py_DECREF(bytes);
3988 return NULL;
3989 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003990 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
Christian Heimesf051e432016-09-13 20:22:02 +02003991 memcpy(_PyUnicode_UTF8(unicode),
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003992 PyBytes_AS_STRING(bytes),
3993 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003994 Py_DECREF(bytes);
3995 }
3996
3997 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003998 *psize = PyUnicode_UTF8_LENGTH(unicode);
3999 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004000}
4001
4002char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004003PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004004{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004005 return PyUnicode_AsUTF8AndSize(unicode, NULL);
4006}
4007
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004008Py_UNICODE *
4009PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
4010{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004011 const unsigned char *one_byte;
4012#if SIZEOF_WCHAR_T == 4
4013 const Py_UCS2 *two_bytes;
4014#else
4015 const Py_UCS4 *four_bytes;
4016 const Py_UCS4 *ucs4_end;
4017 Py_ssize_t num_surrogates;
4018#endif
4019 wchar_t *w;
4020 wchar_t *wchar_end;
4021
4022 if (!PyUnicode_Check(unicode)) {
4023 PyErr_BadArgument();
4024 return NULL;
4025 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004026 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004027 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004028 assert(_PyUnicode_KIND(unicode) != 0);
4029 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004030
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004031 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004032#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004033 four_bytes = PyUnicode_4BYTE_DATA(unicode);
4034 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004035 num_surrogates = 0;
4036
4037 for (; four_bytes < ucs4_end; ++four_bytes) {
4038 if (*four_bytes > 0xFFFF)
4039 ++num_surrogates;
4040 }
4041
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004042 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
4043 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
4044 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004045 PyErr_NoMemory();
4046 return NULL;
4047 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004048 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004049
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004050 w = _PyUnicode_WSTR(unicode);
4051 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
4052 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004053 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
4054 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004055 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004056 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01004057 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
4058 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004059 }
4060 else
4061 *w = *four_bytes;
4062
4063 if (w > wchar_end) {
4064 assert(0 && "Miscalculated string end");
4065 }
4066 }
4067 *w = 0;
4068#else
4069 /* sizeof(wchar_t) == 4 */
4070 Py_FatalError("Impossible unicode object state, wstr and str "
4071 "should share memory already.");
4072 return NULL;
4073#endif
4074 }
4075 else {
Serhiy Storchakae55181f2015-02-20 21:34:06 +02004076 if ((size_t)_PyUnicode_LENGTH(unicode) >
4077 PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
4078 PyErr_NoMemory();
4079 return NULL;
4080 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004081 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
4082 (_PyUnicode_LENGTH(unicode) + 1));
4083 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004084 PyErr_NoMemory();
4085 return NULL;
4086 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004087 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
4088 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
4089 w = _PyUnicode_WSTR(unicode);
4090 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004091
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004092 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
4093 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004094 for (; w < wchar_end; ++one_byte, ++w)
4095 *w = *one_byte;
4096 /* null-terminate the wstr */
4097 *w = 0;
4098 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004099 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004100#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004101 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004102 for (; w < wchar_end; ++two_bytes, ++w)
4103 *w = *two_bytes;
4104 /* null-terminate the wstr */
4105 *w = 0;
4106#else
4107 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004108 PyObject_FREE(_PyUnicode_WSTR(unicode));
4109 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004110 Py_FatalError("Impossible unicode object state, wstr "
4111 "and str should share memory already.");
4112 return NULL;
4113#endif
4114 }
4115 else {
4116 assert(0 && "This should never happen.");
4117 }
4118 }
4119 }
4120 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004121 *size = PyUnicode_WSTR_LENGTH(unicode);
4122 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00004123}
4124
Alexander Belopolsky40018472011-02-26 01:02:56 +00004125Py_UNICODE *
4126PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004127{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004128 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004129}
4130
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004131
Alexander Belopolsky40018472011-02-26 01:02:56 +00004132Py_ssize_t
4133PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004134{
4135 if (!PyUnicode_Check(unicode)) {
4136 PyErr_BadArgument();
4137 goto onError;
4138 }
4139 return PyUnicode_GET_SIZE(unicode);
4140
Benjamin Peterson29060642009-01-31 22:14:21 +00004141 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004142 return -1;
4143}
4144
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004145Py_ssize_t
4146PyUnicode_GetLength(PyObject *unicode)
4147{
Victor Stinner07621332012-06-16 04:53:46 +02004148 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004149 PyErr_BadArgument();
4150 return -1;
4151 }
Victor Stinner07621332012-06-16 04:53:46 +02004152 if (PyUnicode_READY(unicode) == -1)
4153 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004154 return PyUnicode_GET_LENGTH(unicode);
4155}
4156
4157Py_UCS4
4158PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4159{
Victor Stinner69ed0f42013-04-09 21:48:24 +02004160 void *data;
4161 int kind;
4162
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004163 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
4164 PyErr_BadArgument();
4165 return (Py_UCS4)-1;
4166 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004167 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004168 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004169 return (Py_UCS4)-1;
4170 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02004171 data = PyUnicode_DATA(unicode);
4172 kind = PyUnicode_KIND(unicode);
4173 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004174}
4175
4176int
4177PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4178{
4179 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004180 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004181 return -1;
4182 }
Victor Stinner488fa492011-12-12 00:01:39 +01004183 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004184 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004185 PyErr_SetString(PyExc_IndexError, "string index out of range");
4186 return -1;
4187 }
Victor Stinner488fa492011-12-12 00:01:39 +01004188 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004189 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004190 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4191 PyErr_SetString(PyExc_ValueError, "character out of range");
4192 return -1;
4193 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004194 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4195 index, ch);
4196 return 0;
4197}
4198
/* Return the name of the default encoding, which is always "utf-8".  The
   returned string has static storage and must not be modified or freed. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4204
Victor Stinner554f3f02010-06-16 23:33:54 +00004205/* create or adjust a UnicodeDecodeError */
4206static void
4207make_decode_exception(PyObject **exceptionObject,
4208 const char *encoding,
4209 const char *input, Py_ssize_t length,
4210 Py_ssize_t startpos, Py_ssize_t endpos,
4211 const char *reason)
4212{
4213 if (*exceptionObject == NULL) {
4214 *exceptionObject = PyUnicodeDecodeError_Create(
4215 encoding, input, length, startpos, endpos, reason);
4216 }
4217 else {
4218 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4219 goto onError;
4220 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4221 goto onError;
4222 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4223 goto onError;
4224 }
4225 return;
4226
4227onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004228 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004229}
4230
Steve Dowercc16be82016-09-08 10:35:16 -07004231#ifdef MS_WINDOWS
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004232/* error handling callback helper:
4233 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00004234 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004235 and adjust various state variables.
4236 return 0 on success, -1 on error
4237*/
4238
Alexander Belopolsky40018472011-02-26 01:02:56 +00004239static int
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004240unicode_decode_call_errorhandler_wchar(
4241 const char *errors, PyObject **errorHandler,
4242 const char *encoding, const char *reason,
4243 const char **input, const char **inend, Py_ssize_t *startinpos,
4244 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4245 PyObject **output, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004246{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02004247 static const char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004248
4249 PyObject *restuple = NULL;
4250 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01004251 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004252 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004253 Py_ssize_t requiredsize;
4254 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004255 PyObject *inputobj = NULL;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004256 wchar_t *repwstr;
4257 Py_ssize_t repwlen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004258
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004259 assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
4260 outsize = _PyUnicode_WSTR_LENGTH(*output);
Victor Stinner596a6c42011-11-09 00:02:18 +01004261
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004262 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004263 *errorHandler = PyCodec_LookupError(errors);
4264 if (*errorHandler == NULL)
4265 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004266 }
4267
Victor Stinner554f3f02010-06-16 23:33:54 +00004268 make_decode_exception(exceptionObject,
4269 encoding,
4270 *input, *inend - *input,
4271 *startinpos, *endinpos,
4272 reason);
4273 if (*exceptionObject == NULL)
4274 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004275
4276 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4277 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004278 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004279 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00004280 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004281 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004282 }
4283 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004284 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004285
4286 /* Copy back the bytes variables, which might have been modified by the
4287 callback */
4288 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4289 if (!inputobj)
4290 goto onError;
4291 if (!PyBytes_Check(inputobj)) {
4292 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
4293 }
4294 *input = PyBytes_AS_STRING(inputobj);
4295 insize = PyBytes_GET_SIZE(inputobj);
4296 *inend = *input + insize;
4297 /* we can DECREF safely, as the exception has another reference,
4298 so the object won't go away. */
4299 Py_DECREF(inputobj);
4300
4301 if (newpos<0)
4302 newpos = insize+newpos;
4303 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004304 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004305 goto onError;
4306 }
4307
4308 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4309 if (repwstr == NULL)
4310 goto onError;
4311 /* need more space? (at least enough for what we
4312 have+the replacement+the rest of the string (starting
4313 at the new input position), so we won't have to check space
4314 when there are no errors in the rest of the string) */
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004315 requiredsize = *outpos;
4316 if (requiredsize > PY_SSIZE_T_MAX - repwlen)
4317 goto overflow;
4318 requiredsize += repwlen;
4319 if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4320 goto overflow;
4321 requiredsize += insize - newpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004322 if (requiredsize > outsize) {
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004323 if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004324 requiredsize = 2*outsize;
4325 if (unicode_resize(output, requiredsize) < 0)
4326 goto onError;
4327 }
4328 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
4329 *outpos += repwlen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004330 *endinpos = newpos;
4331 *inptr = *input + newpos;
4332
4333 /* we made it! */
4334 Py_XDECREF(restuple);
4335 return 0;
4336
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004337 overflow:
4338 PyErr_SetString(PyExc_OverflowError,
4339 "decoded result is too long for a Python string");
4340
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004341 onError:
4342 Py_XDECREF(restuple);
4343 return -1;
4344}
Steve Dowercc16be82016-09-08 10:35:16 -07004345#endif /* MS_WINDOWS */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004346
4347static int
4348unicode_decode_call_errorhandler_writer(
4349 const char *errors, PyObject **errorHandler,
4350 const char *encoding, const char *reason,
4351 const char **input, const char **inend, Py_ssize_t *startinpos,
4352 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4353 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4354{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02004355 static const char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004356
4357 PyObject *restuple = NULL;
4358 PyObject *repunicode = NULL;
4359 Py_ssize_t insize;
4360 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004361 Py_ssize_t replen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004362 PyObject *inputobj = NULL;
4363
4364 if (*errorHandler == NULL) {
4365 *errorHandler = PyCodec_LookupError(errors);
4366 if (*errorHandler == NULL)
4367 goto onError;
4368 }
4369
4370 make_decode_exception(exceptionObject,
4371 encoding,
4372 *input, *inend - *input,
4373 *startinpos, *endinpos,
4374 reason);
4375 if (*exceptionObject == NULL)
4376 goto onError;
4377
4378 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4379 if (restuple == NULL)
4380 goto onError;
4381 if (!PyTuple_Check(restuple)) {
4382 PyErr_SetString(PyExc_TypeError, &argparse[4]);
4383 goto onError;
4384 }
4385 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004386 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004387
4388 /* Copy back the bytes variables, which might have been modified by the
4389 callback */
4390 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4391 if (!inputobj)
4392 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00004393 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004394 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00004395 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004396 *input = PyBytes_AS_STRING(inputobj);
4397 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004398 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004399 /* we can DECREF safely, as the exception has another reference,
4400 so the object won't go away. */
4401 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004402
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004403 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004404 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004405 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004406 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004407 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004408 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004409
Victor Stinner8f674cc2013-04-17 23:02:17 +02004410 if (PyUnicode_READY(repunicode) < 0)
4411 goto onError;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004412 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004413 if (replen > 1) {
4414 writer->min_length += replen - 1;
Victor Stinner8f674cc2013-04-17 23:02:17 +02004415 writer->overallocate = 1;
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004416 if (_PyUnicodeWriter_Prepare(writer, writer->min_length,
4417 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4418 goto onError;
4419 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004420 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004421 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004422
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004423 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004424 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004425
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004426 /* we made it! */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004427 Py_XDECREF(restuple);
4428 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004429
Benjamin Peterson29060642009-01-31 22:14:21 +00004430 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004431 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004432 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004433}
4434
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004435/* --- UTF-7 Codec -------------------------------------------------------- */
4436
Antoine Pitrou244651a2009-05-04 18:56:13 +00004437/* See RFC2152 for details. We encode conservatively and decode liberally. */
4438
4439/* Three simple macros defining base-64. */
4440
/* True when c is one of the 64 base-64 characters
   (letters, digits, '+' or '/'). */

#define IS_BASE64(c) \
    ((c) == '+' || (c) == '/' || \
     ((c) >= '0' && (c) <= '9') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= 'A' && (c) <= 'Z'))
4448
/* Base-64 value (0..63) of c.  Only meaningful when c is a base-64
   character; any character not matched below yields 63. */

#define FROM_BASE64(c) \
    ((c) == '+' ? 62 : \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 : \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 : \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : \
     63)
4456
/* Base-64 character for the low 6 bits of n (higher bits are ignored). */

#define TO_BASE64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ" \
     "abcdefghijklmnopqrstuvwxyz" \
     "0123456789+/"[(n) & 0x3f])
4461
/* DECODE_DIRECT: does this byte, met in a UTF-7 string, decode as
 * itself?  Decoding is permissive: every ASCII byte does, except '+',
 * which opens a base64 string. */

#define DECODE_DIRECT(c) \
    ((c) != '+' && (c) < 128)
4469
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is a read-only lookup, hence const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c must be in 1..127 to qualify; the range test comes first so the
 * table is never indexed out of bounds.  directO and directWS are
 * parenthesized in the expansion so that compound expressions can be
 * passed safely. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004515
Alexander Belopolsky40018472011-02-26 01:02:56 +00004516PyObject *
4517PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004518 Py_ssize_t size,
4519 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004520{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004521 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4522}
4523
Antoine Pitrou244651a2009-05-04 18:56:13 +00004524/* The decoder. The only state we preserve is our read position,
4525 * i.e. how many characters we have consumed. So if we end in the
4526 * middle of a shift sequence we have to back off the read position
4527 * and the output to the beginning of the sequence, otherwise we lose
4528 * all the shift state (seen bits, number of bits seen, high
4529 * surrogate). */
4530
Alexander Belopolsky40018472011-02-26 01:02:56 +00004531PyObject *
4532PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004533 Py_ssize_t size,
4534 const char *errors,
4535 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004536{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004537 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004538 Py_ssize_t startinpos;
4539 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004540 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004541 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004542 const char *errmsg = "";
4543 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004544 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004545 unsigned int base64bits = 0;
4546 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004547 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004548 PyObject *errorHandler = NULL;
4549 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004550
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004551 if (size == 0) {
4552 if (consumed)
4553 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004554 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004555 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004556
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004557 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004558 _PyUnicodeWriter_Init(&writer);
4559 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004560
4561 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004562 e = s + size;
4563
4564 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004565 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004566 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004567 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004568
Antoine Pitrou244651a2009-05-04 18:56:13 +00004569 if (inShift) { /* in a base-64 section */
4570 if (IS_BASE64(ch)) { /* consume a base-64 character */
4571 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4572 base64bits += 6;
4573 s++;
4574 if (base64bits >= 16) {
4575 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004576 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004577 base64bits -= 16;
4578 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004579 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004580 if (surrogate) {
4581 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004582 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4583 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004584 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004585 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004586 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004587 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004588 }
4589 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004590 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004591 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004592 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004593 }
4594 }
Victor Stinner551ac952011-11-29 22:58:13 +01004595 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004596 /* first surrogate */
4597 surrogate = outCh;
4598 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004599 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004600 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004601 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004602 }
4603 }
4604 }
4605 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004606 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004607 if (base64bits > 0) { /* left-over bits */
4608 if (base64bits >= 6) {
4609 /* We've seen at least one base-64 character */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004610 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004611 errmsg = "partial character in shift sequence";
4612 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004613 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004614 else {
4615 /* Some bits remain; they should be zero */
4616 if (base64buffer != 0) {
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004617 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004618 errmsg = "non-zero padding bits in shift sequence";
4619 goto utf7Error;
4620 }
4621 }
4622 }
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004623 if (surrogate && DECODE_DIRECT(ch)) {
4624 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4625 goto onError;
4626 }
4627 surrogate = 0;
4628 if (ch == '-') {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004629 /* '-' is absorbed; other terminating
4630 characters are preserved */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004631 s++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004632 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004633 }
4634 }
4635 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004636 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004637 s++; /* consume '+' */
4638 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004639 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004640 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004641 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004642 }
4643 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004644 inShift = 1;
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004645 surrogate = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004646 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004647 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004648 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004649 }
4650 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004651 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004652 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004653 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004654 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004655 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004656 else {
4657 startinpos = s-starts;
4658 s++;
4659 errmsg = "unexpected special character";
4660 goto utf7Error;
4661 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004662 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004663utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004664 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004665 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004666 errors, &errorHandler,
4667 "utf7", errmsg,
4668 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004669 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004670 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004671 }
4672
Antoine Pitrou244651a2009-05-04 18:56:13 +00004673 /* end of string */
4674
4675 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4676 /* if we're in an inconsistent state, that's an error */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004677 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004678 if (surrogate ||
4679 (base64bits >= 6) ||
4680 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004681 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004682 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004683 errors, &errorHandler,
4684 "utf7", "unterminated shift sequence",
4685 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004686 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004687 goto onError;
4688 if (s < e)
4689 goto restart;
4690 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004691 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004692
4693 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004694 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004695 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004696 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004697 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004698 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004699 writer.kind, writer.data, shiftOutStart);
4700 Py_XDECREF(errorHandler);
4701 Py_XDECREF(exc);
4702 _PyUnicodeWriter_Dealloc(&writer);
4703 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004704 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004705 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004706 }
4707 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004708 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004709 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004710 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004711
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004712 Py_XDECREF(errorHandler);
4713 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004714 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004715
Benjamin Peterson29060642009-01-31 22:14:21 +00004716 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004717 Py_XDECREF(errorHandler);
4718 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004719 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004720 return NULL;
4721}
4722
4723
Alexander Belopolsky40018472011-02-26 01:02:56 +00004724PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004725_PyUnicode_EncodeUTF7(PyObject *str,
4726 int base64SetO,
4727 int base64WhiteSpace,
4728 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004729{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004730 int kind;
4731 void *data;
4732 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004733 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004734 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004735 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004736 unsigned int base64bits = 0;
4737 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004738 char * out;
4739 char * start;
4740
Benjamin Petersonbac79492012-01-14 13:34:47 -05004741 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004742 return NULL;
4743 kind = PyUnicode_KIND(str);
4744 data = PyUnicode_DATA(str);
4745 len = PyUnicode_GET_LENGTH(str);
4746
4747 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004748 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004749
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004750 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004751 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004752 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004753 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004754 if (v == NULL)
4755 return NULL;
4756
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004757 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004758 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004759 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004760
Antoine Pitrou244651a2009-05-04 18:56:13 +00004761 if (inShift) {
4762 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4763 /* shifting out */
4764 if (base64bits) { /* output remaining bits */
4765 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4766 base64buffer = 0;
4767 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004768 }
4769 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004770 /* Characters not in the BASE64 set implicitly unshift the sequence
4771 so no '-' is required, except if the character is itself a '-' */
4772 if (IS_BASE64(ch) || ch == '-') {
4773 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004774 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004775 *out++ = (char) ch;
4776 }
4777 else {
4778 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004779 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004780 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004781 else { /* not in a shift sequence */
4782 if (ch == '+') {
4783 *out++ = '+';
4784 *out++ = '-';
4785 }
4786 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4787 *out++ = (char) ch;
4788 }
4789 else {
4790 *out++ = '+';
4791 inShift = 1;
4792 goto encode_char;
4793 }
4794 }
4795 continue;
4796encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004797 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004798 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004799
Antoine Pitrou244651a2009-05-04 18:56:13 +00004800 /* code first surrogate */
4801 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004802 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004803 while (base64bits >= 6) {
4804 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4805 base64bits -= 6;
4806 }
4807 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004808 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004809 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004810 base64bits += 16;
4811 base64buffer = (base64buffer << 16) | ch;
4812 while (base64bits >= 6) {
4813 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4814 base64bits -= 6;
4815 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004816 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004817 if (base64bits)
4818 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4819 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004820 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004821 if (_PyBytes_Resize(&v, out - start) < 0)
4822 return NULL;
4823 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004824}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004825PyObject *
4826PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4827 Py_ssize_t size,
4828 int base64SetO,
4829 int base64WhiteSpace,
4830 const char *errors)
4831{
4832 PyObject *result;
4833 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4834 if (tmp == NULL)
4835 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004836 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004837 base64WhiteSpace, errors);
4838 Py_DECREF(tmp);
4839 return result;
4840}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004841
Antoine Pitrou244651a2009-05-04 18:56:13 +00004842#undef IS_BASE64
4843#undef FROM_BASE64
4844#undef TO_BASE64
4845#undef DECODE_DIRECT
4846#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004847
Guido van Rossumd57fd912000-03-10 22:53:23 +00004848/* --- UTF-8 Codec -------------------------------------------------------- */
4849
Alexander Belopolsky40018472011-02-26 01:02:56 +00004850PyObject *
4851PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004852 Py_ssize_t size,
4853 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004854{
Walter Dörwald69652032004-09-07 20:24:22 +00004855 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4856}
4857
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004858#include "stringlib/asciilib.h"
4859#include "stringlib/codecs.h"
4860#include "stringlib/undef.h"
4861
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004862#include "stringlib/ucs1lib.h"
4863#include "stringlib/codecs.h"
4864#include "stringlib/undef.h"
4865
4866#include "stringlib/ucs2lib.h"
4867#include "stringlib/codecs.h"
4868#include "stringlib/undef.h"
4869
4870#include "stringlib/ucs4lib.h"
4871#include "stringlib/codecs.h"
4872#include "stringlib/undef.h"
4873
Antoine Pitrouab868312009-01-10 15:40:25 +00004874/* Mask to quickly check whether a C 'long' contains a
4875 non-ASCII, UTF8-encoded char. */
4876#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004877# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004878#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004879# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004880#else
4881# error C 'long' size should be either 4 or 8!
4882#endif
4883
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004884static Py_ssize_t
4885ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004886{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004887 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004888 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004889
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004890 /*
4891 * Issue #17237: m68k is a bit different from most architectures in
4892 * that objects do not use "natural alignment" - for example, int and
4893 * long are only aligned at 2-byte boundaries. Therefore the assert()
4894 * won't work; also, tests have shown that skipping the "optimised
4895 * version" will even speed up m68k.
4896 */
4897#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004898#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004899 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4900 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004901 /* Fast path, see in STRINGLIB(utf8_decode) for
4902 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004903 /* Help allocation */
4904 const char *_p = p;
4905 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004906 while (_p < aligned_end) {
4907 unsigned long value = *(const unsigned long *) _p;
4908 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004909 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004910 *((unsigned long *)q) = value;
4911 _p += SIZEOF_LONG;
4912 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004913 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004914 p = _p;
4915 while (p < end) {
4916 if ((unsigned char)*p & 0x80)
4917 break;
4918 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004919 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004920 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004921 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004922#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004923#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004924 while (p < end) {
4925 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4926 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004927 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004928 /* Help allocation */
4929 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004930 while (_p < aligned_end) {
4931 unsigned long value = *(unsigned long *) _p;
4932 if (value & ASCII_CHAR_MASK)
4933 break;
4934 _p += SIZEOF_LONG;
4935 }
4936 p = _p;
4937 if (_p == end)
4938 break;
4939 }
4940 if ((unsigned char)*p & 0x80)
4941 break;
4942 ++p;
4943 }
4944 memcpy(dest, start, p - start);
4945 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004946}
Antoine Pitrouab868312009-01-10 15:40:25 +00004947
Victor Stinner785938e2011-12-11 20:09:03 +01004948PyObject *
4949PyUnicode_DecodeUTF8Stateful(const char *s,
4950 Py_ssize_t size,
4951 const char *errors,
4952 Py_ssize_t *consumed)
4953{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004954 _PyUnicodeWriter writer;
Victor Stinner785938e2011-12-11 20:09:03 +01004955 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004956 const char *end = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004957
4958 Py_ssize_t startinpos;
4959 Py_ssize_t endinpos;
4960 const char *errmsg = "";
Victor Stinner1d65d912015-10-05 13:43:50 +02004961 PyObject *error_handler_obj = NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004962 PyObject *exc = NULL;
Victor Stinner1d65d912015-10-05 13:43:50 +02004963 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner785938e2011-12-11 20:09:03 +01004964
4965 if (size == 0) {
4966 if (consumed)
4967 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004968 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004969 }
4970
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004971 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4972 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004973 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004974 *consumed = 1;
4975 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004976 }
4977
Victor Stinner8f674cc2013-04-17 23:02:17 +02004978 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02004979 writer.min_length = size;
4980 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004981 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004982
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004983 writer.pos = ascii_decode(s, end, writer.data);
4984 s += writer.pos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004985 while (s < end) {
4986 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004987 int kind = writer.kind;
Victor Stinner1d65d912015-10-05 13:43:50 +02004988
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004989 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004990 if (PyUnicode_IS_ASCII(writer.buffer))
4991 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004992 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004993 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004994 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004995 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004996 } else {
4997 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004998 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004999 }
5000
5001 switch (ch) {
5002 case 0:
5003 if (s == end || consumed)
5004 goto End;
5005 errmsg = "unexpected end of data";
5006 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005007 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005008 break;
5009 case 1:
5010 errmsg = "invalid start byte";
5011 startinpos = s - starts;
5012 endinpos = startinpos + 1;
5013 break;
5014 case 2:
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005015 case 3:
5016 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005017 errmsg = "invalid continuation byte";
5018 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005019 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005020 break;
5021 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005022 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005023 goto onError;
5024 continue;
5025 }
5026
Victor Stinner1d65d912015-10-05 13:43:50 +02005027 if (error_handler == _Py_ERROR_UNKNOWN)
5028 error_handler = get_error_handler(errors);
5029
5030 switch (error_handler) {
5031 case _Py_ERROR_IGNORE:
5032 s += (endinpos - startinpos);
5033 break;
5034
5035 case _Py_ERROR_REPLACE:
5036 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
5037 goto onError;
5038 s += (endinpos - startinpos);
5039 break;
5040
5041 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner74e8fac2015-10-05 13:49:26 +02005042 {
5043 Py_ssize_t i;
5044
Victor Stinner1d65d912015-10-05 13:43:50 +02005045 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
5046 goto onError;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005047 for (i=startinpos; i<endinpos; i++) {
Victor Stinner1d65d912015-10-05 13:43:50 +02005048 ch = (Py_UCS4)(unsigned char)(starts[i]);
5049 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
5050 ch + 0xdc00);
5051 writer.pos++;
5052 }
5053 s += (endinpos - startinpos);
5054 break;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005055 }
Victor Stinner1d65d912015-10-05 13:43:50 +02005056
5057 default:
5058 if (unicode_decode_call_errorhandler_writer(
5059 errors, &error_handler_obj,
5060 "utf-8", errmsg,
5061 &starts, &end, &startinpos, &endinpos, &exc, &s,
5062 &writer))
5063 goto onError;
5064 }
Victor Stinner785938e2011-12-11 20:09:03 +01005065 }
5066
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005067End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005068 if (consumed)
5069 *consumed = s - starts;
5070
Victor Stinner1d65d912015-10-05 13:43:50 +02005071 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005072 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005073 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005074
5075onError:
Victor Stinner1d65d912015-10-05 13:43:50 +02005076 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005077 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005078 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005079 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01005080}
5081
#ifdef __APPLE__

/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X.

   Undecodable bytes are mapped one by one to U+DC00 + byte.

   Return a pointer to a newly allocated wide character string (use
   PyMem_RawFree() to free the memory), or NULL on memory allocation error
   (including a `size` too large to allocate a buffer for). */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    const char *e;
    wchar_t *unicode;
    Py_ssize_t outpos;

    /* Note: size will always be longer than the resulting Unicode
       character count, so size + 1 wide characters (including the
       terminator) are enough.  The bound is written without computing
       size + 1 so that the check itself cannot overflow. */
    if (size > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1)
        return NULL;
    unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
    if (!unicode)
        return NULL;

    /* Unpack UTF-8 encoded data */
    e = s + size;
    outpos = 0;
    while (s < e) {
        Py_UCS4 ch;
#if SIZEOF_WCHAR_T == 4
        ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
#else
        ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
#endif
        if (ch > 0xFF) {
#if SIZEOF_WCHAR_T == 4
            /* not expected with a 4-byte output buffer */
            assert(0);
#else
            /* A code point that does not fit in a 16-bit wchar_t: it must
               be outside the BMP (it is the value being split into a
               surrogate pair, not a surrogate itself). */
            assert(ch > 0xFFFF);
            /* compute and append the two surrogates: */
            unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
            unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
#endif
        }
        else {
            /* Status code from the decoder: 0 with all input consumed
               means we are done; otherwise escape one byte and retry. */
            if (!ch && s == e)
                break;
            /* surrogateescape */
            unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
        }
    }
    unicode[outpos] = L'\0';
    return unicode;
}

#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00005137
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005138/* Primary internal function which creates utf8 encoded bytes objects.
5139
5140 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005141 and allocate exactly as much space needed at the end. Else allocate the
5142 maximum possible needed (4 result bytes per Unicode character), and return
5143 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005144*/
Tim Peters7e3d9612002-04-21 03:26:37 +00005145PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01005146_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005147{
Victor Stinner6099a032011-12-18 14:22:26 +01005148 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005149 void *data;
5150 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00005151
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005152 if (!PyUnicode_Check(unicode)) {
5153 PyErr_BadArgument();
5154 return NULL;
5155 }
5156
5157 if (PyUnicode_READY(unicode) == -1)
5158 return NULL;
5159
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005160 if (PyUnicode_UTF8(unicode))
5161 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5162 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005163
5164 kind = PyUnicode_KIND(unicode);
5165 data = PyUnicode_DATA(unicode);
5166 size = PyUnicode_GET_LENGTH(unicode);
5167
Benjamin Petersonead6b532011-12-20 17:23:42 -06005168 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005169 default:
5170 assert(0);
5171 case PyUnicode_1BYTE_KIND:
5172 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5173 assert(!PyUnicode_IS_ASCII(unicode));
5174 return ucs1lib_utf8_encoder(unicode, data, size, errors);
5175 case PyUnicode_2BYTE_KIND:
5176 return ucs2lib_utf8_encoder(unicode, data, size, errors);
5177 case PyUnicode_4BYTE_KIND:
5178 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00005179 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005180}
5181
Alexander Belopolsky40018472011-02-26 01:02:56 +00005182PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005183PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5184 Py_ssize_t size,
5185 const char *errors)
5186{
5187 PyObject *v, *unicode;
5188
5189 unicode = PyUnicode_FromUnicode(s, size);
5190 if (unicode == NULL)
5191 return NULL;
5192 v = _PyUnicode_AsUTF8String(unicode, errors);
5193 Py_DECREF(unicode);
5194 return v;
5195}
5196
5197PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005198PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005199{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005200 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005201}
5202
Walter Dörwald41980ca2007-08-16 21:55:45 +00005203/* --- UTF-32 Codec ------------------------------------------------------- */
5204
5205PyObject *
5206PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005207 Py_ssize_t size,
5208 const char *errors,
5209 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005210{
5211 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5212}
5213
5214PyObject *
5215PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005216 Py_ssize_t size,
5217 const char *errors,
5218 int *byteorder,
5219 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005220{
5221 const char *starts = s;
5222 Py_ssize_t startinpos;
5223 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005224 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005225 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01005226 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005227 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005228 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005229 PyObject *errorHandler = NULL;
5230 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005231
Walter Dörwald41980ca2007-08-16 21:55:45 +00005232 q = (unsigned char *)s;
5233 e = q + size;
5234
5235 if (byteorder)
5236 bo = *byteorder;
5237
5238 /* Check for BOM marks (U+FEFF) in the input and adjust current
5239 byte order setting accordingly. In native mode, the leading BOM
5240 mark is skipped, in all other modes, it is copied to the output
5241 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01005242 if (bo == 0 && size >= 4) {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005243 Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005244 if (bom == 0x0000FEFF) {
5245 bo = -1;
5246 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005247 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005248 else if (bom == 0xFFFE0000) {
5249 bo = 1;
5250 q += 4;
5251 }
5252 if (byteorder)
5253 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005254 }
5255
Victor Stinnere64322e2012-10-30 23:12:47 +01005256 if (q == e) {
5257 if (consumed)
5258 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005259 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00005260 }
5261
Victor Stinnere64322e2012-10-30 23:12:47 +01005262#ifdef WORDS_BIGENDIAN
5263 le = bo < 0;
5264#else
5265 le = bo <= 0;
5266#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005267 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01005268
Victor Stinner8f674cc2013-04-17 23:02:17 +02005269 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005270 writer.min_length = (e - q + 3) / 4;
5271 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005272 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005273
Victor Stinnere64322e2012-10-30 23:12:47 +01005274 while (1) {
5275 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005276 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005277
Victor Stinnere64322e2012-10-30 23:12:47 +01005278 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005279 enum PyUnicode_Kind kind = writer.kind;
5280 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005281 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005282 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005283 if (le) {
5284 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005285 ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005286 if (ch > maxch)
5287 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005288 if (kind != PyUnicode_1BYTE_KIND &&
5289 Py_UNICODE_IS_SURROGATE(ch))
5290 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005291 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005292 q += 4;
5293 } while (q <= last);
5294 }
5295 else {
5296 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005297 ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
Victor Stinnere64322e2012-10-30 23:12:47 +01005298 if (ch > maxch)
5299 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005300 if (kind != PyUnicode_1BYTE_KIND &&
5301 Py_UNICODE_IS_SURROGATE(ch))
5302 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005303 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005304 q += 4;
5305 } while (q <= last);
5306 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005307 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005308 }
5309
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005310 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005311 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005312 startinpos = ((const char *)q) - starts;
5313 endinpos = startinpos + 4;
5314 }
5315 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005316 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005317 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005318 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005319 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005320 startinpos = ((const char *)q) - starts;
5321 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005322 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005323 else {
5324 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005325 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005326 goto onError;
5327 q += 4;
5328 continue;
5329 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005330 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005331 startinpos = ((const char *)q) - starts;
5332 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005333 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005334
5335 /* The remaining input chars are ignored if the callback
5336 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005337 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005338 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005339 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005340 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005341 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005342 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005343 }
5344
Walter Dörwald41980ca2007-08-16 21:55:45 +00005345 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005346 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005347
Walter Dörwald41980ca2007-08-16 21:55:45 +00005348 Py_XDECREF(errorHandler);
5349 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005350 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005351
Benjamin Peterson29060642009-01-31 22:14:21 +00005352 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005353 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005354 Py_XDECREF(errorHandler);
5355 Py_XDECREF(exc);
5356 return NULL;
5357}
5358
5359PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005360_PyUnicode_EncodeUTF32(PyObject *str,
5361 const char *errors,
5362 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005363{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005364 enum PyUnicode_Kind kind;
5365 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005366 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005367 PyObject *v;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005368 uint32_t *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005369#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005370 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005371#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005372 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005373#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005374 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005375 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005376 PyObject *errorHandler = NULL;
5377 PyObject *exc = NULL;
5378 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005379
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005380 if (!PyUnicode_Check(str)) {
5381 PyErr_BadArgument();
5382 return NULL;
5383 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005384 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005385 return NULL;
5386 kind = PyUnicode_KIND(str);
5387 data = PyUnicode_DATA(str);
5388 len = PyUnicode_GET_LENGTH(str);
5389
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005390 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005391 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005392 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005393 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005394 if (v == NULL)
5395 return NULL;
5396
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005397 /* output buffer is 4-bytes aligned */
5398 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005399 out = (uint32_t *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005400 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005401 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005402 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005403 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005404
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005405 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005406 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005407 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005408 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005409 else
5410 encoding = "utf-32";
5411
5412 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005413 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5414 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005415 }
5416
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005417 pos = 0;
5418 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005419 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005420
5421 if (kind == PyUnicode_2BYTE_KIND) {
5422 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5423 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005424 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005425 else {
5426 assert(kind == PyUnicode_4BYTE_KIND);
5427 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5428 &out, native_ordering);
5429 }
5430 if (pos == len)
5431 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005432
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005433 rep = unicode_encode_call_errorhandler(
5434 errors, &errorHandler,
5435 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005436 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005437 if (!rep)
5438 goto error;
5439
5440 if (PyBytes_Check(rep)) {
5441 repsize = PyBytes_GET_SIZE(rep);
5442 if (repsize & 3) {
5443 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005444 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005445 "surrogates not allowed");
5446 goto error;
5447 }
5448 moreunits = repsize / 4;
5449 }
5450 else {
5451 assert(PyUnicode_Check(rep));
5452 if (PyUnicode_READY(rep) < 0)
5453 goto error;
5454 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5455 if (!PyUnicode_IS_ASCII(rep)) {
5456 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005457 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005458 "surrogates not allowed");
5459 goto error;
5460 }
5461 }
5462
5463 /* four bytes are reserved for each surrogate */
5464 if (moreunits > 1) {
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005465 Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005466 Py_ssize_t morebytes = 4 * (moreunits - 1);
5467 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5468 /* integer overflow */
5469 PyErr_NoMemory();
5470 goto error;
5471 }
5472 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5473 goto error;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005474 out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005475 }
5476
5477 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005478 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005479 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005480 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005481 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005482 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5483 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005484 }
5485
5486 Py_CLEAR(rep);
5487 }
5488
5489 /* Cut back to size actually needed. This is necessary for, for example,
5490 encoding of a string containing isolated surrogates and the 'ignore'
5491 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005492 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005493 if (nsize != PyBytes_GET_SIZE(v))
5494 _PyBytes_Resize(&v, nsize);
5495 Py_XDECREF(errorHandler);
5496 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005497 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005498 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005499 error:
5500 Py_XDECREF(rep);
5501 Py_XDECREF(errorHandler);
5502 Py_XDECREF(exc);
5503 Py_XDECREF(v);
5504 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005505}
5506
Alexander Belopolsky40018472011-02-26 01:02:56 +00005507PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005508PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5509 Py_ssize_t size,
5510 const char *errors,
5511 int byteorder)
5512{
5513 PyObject *result;
5514 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5515 if (tmp == NULL)
5516 return NULL;
5517 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5518 Py_DECREF(tmp);
5519 return result;
5520}
5521
5522PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005523PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005524{
Victor Stinnerb960b342011-11-20 19:12:52 +01005525 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005526}
5527
Guido van Rossumd57fd912000-03-10 22:53:23 +00005528/* --- UTF-16 Codec ------------------------------------------------------- */
5529
Tim Peters772747b2001-08-09 22:21:55 +00005530PyObject *
5531PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005532 Py_ssize_t size,
5533 const char *errors,
5534 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005535{
Walter Dörwald69652032004-09-07 20:24:22 +00005536 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5537}
5538
5539PyObject *
5540PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005541 Py_ssize_t size,
5542 const char *errors,
5543 int *byteorder,
5544 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005545{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005546 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005547 Py_ssize_t startinpos;
5548 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005549 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005550 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005551 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005552 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005553 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005554 PyObject *errorHandler = NULL;
5555 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005556 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005557
Tim Peters772747b2001-08-09 22:21:55 +00005558 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005559 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005560
5561 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005562 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005563
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005564 /* Check for BOM marks (U+FEFF) in the input and adjust current
5565 byte order setting accordingly. In native mode, the leading BOM
5566 mark is skipped, in all other modes, it is copied to the output
5567 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005568 if (bo == 0 && size >= 2) {
5569 const Py_UCS4 bom = (q[1] << 8) | q[0];
5570 if (bom == 0xFEFF) {
5571 q += 2;
5572 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005573 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005574 else if (bom == 0xFFFE) {
5575 q += 2;
5576 bo = 1;
5577 }
5578 if (byteorder)
5579 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005580 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005581
Antoine Pitrou63065d72012-05-15 23:48:04 +02005582 if (q == e) {
5583 if (consumed)
5584 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005585 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005586 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005587
Christian Heimes743e0cd2012-10-17 23:52:17 +02005588#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005589 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005590 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005591#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005592 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005593 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005594#endif
Tim Peters772747b2001-08-09 22:21:55 +00005595
Antoine Pitrou63065d72012-05-15 23:48:04 +02005596 /* Note: size will always be longer than the resulting Unicode
5597 character count */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005598 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005599 writer.min_length = (e - q + 1) / 2;
5600 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005601 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005602
Antoine Pitrou63065d72012-05-15 23:48:04 +02005603 while (1) {
5604 Py_UCS4 ch = 0;
5605 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005606 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005607 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005608 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005609 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005610 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005611 native_ordering);
5612 else
5613 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005614 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005615 native_ordering);
5616 } else if (kind == PyUnicode_2BYTE_KIND) {
5617 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005618 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005619 native_ordering);
5620 } else {
5621 assert(kind == PyUnicode_4BYTE_KIND);
5622 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005623 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005624 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005625 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005626 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005627
Antoine Pitrou63065d72012-05-15 23:48:04 +02005628 switch (ch)
5629 {
5630 case 0:
5631 /* remaining byte at the end? (size should be even) */
5632 if (q == e || consumed)
5633 goto End;
5634 errmsg = "truncated data";
5635 startinpos = ((const char *)q) - starts;
5636 endinpos = ((const char *)e) - starts;
5637 break;
5638 /* The remaining input chars are ignored if the callback
5639 chooses to skip the input */
5640 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005641 q -= 2;
5642 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005643 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005644 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005645 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005646 endinpos = ((const char *)e) - starts;
5647 break;
5648 case 2:
5649 errmsg = "illegal encoding";
5650 startinpos = ((const char *)q) - 2 - starts;
5651 endinpos = startinpos + 2;
5652 break;
5653 case 3:
5654 errmsg = "illegal UTF-16 surrogate";
5655 startinpos = ((const char *)q) - 4 - starts;
5656 endinpos = startinpos + 2;
5657 break;
5658 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005659 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005660 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005661 continue;
5662 }
5663
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005664 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005665 errors,
5666 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005667 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00005668 &starts,
5669 (const char **)&e,
5670 &startinpos,
5671 &endinpos,
5672 &exc,
5673 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005674 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005675 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005676 }
5677
Antoine Pitrou63065d72012-05-15 23:48:04 +02005678End:
Walter Dörwald69652032004-09-07 20:24:22 +00005679 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005680 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005681
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005682 Py_XDECREF(errorHandler);
5683 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005684 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005685
Benjamin Peterson29060642009-01-31 22:14:21 +00005686 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005687 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005688 Py_XDECREF(errorHandler);
5689 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005690 return NULL;
5691}
5692
Tim Peters772747b2001-08-09 22:21:55 +00005693PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005694_PyUnicode_EncodeUTF16(PyObject *str,
5695 const char *errors,
5696 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005697{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005698 enum PyUnicode_Kind kind;
5699 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005700 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005701 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005702 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005703 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005704#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005705 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005706#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005707 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005708#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005709 const char *encoding;
5710 Py_ssize_t nsize, pos;
5711 PyObject *errorHandler = NULL;
5712 PyObject *exc = NULL;
5713 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00005714
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005715 if (!PyUnicode_Check(str)) {
5716 PyErr_BadArgument();
5717 return NULL;
5718 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005719 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005720 return NULL;
5721 kind = PyUnicode_KIND(str);
5722 data = PyUnicode_DATA(str);
5723 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005724
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005725 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005726 if (kind == PyUnicode_4BYTE_KIND) {
5727 const Py_UCS4 *in = (const Py_UCS4 *)data;
5728 const Py_UCS4 *end = in + len;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005729 while (in < end) {
5730 if (*in++ >= 0x10000) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005731 pairs++;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005732 }
5733 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005734 }
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005735 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005736 return PyErr_NoMemory();
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005737 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005738 nsize = len + pairs + (byteorder == 0);
5739 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005740 if (v == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005741 return NULL;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005742 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005743
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005744 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005745 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005746 out = (unsigned short *)PyBytes_AS_STRING(v);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005747 if (byteorder == 0) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005748 *out++ = 0xFEFF;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005749 }
5750 if (len == 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00005751 goto done;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005752 }
Tim Peters772747b2001-08-09 22:21:55 +00005753
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005754 if (kind == PyUnicode_1BYTE_KIND) {
5755 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5756 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005757 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005758
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005759 if (byteorder < 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005760 encoding = "utf-16-le";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005761 }
5762 else if (byteorder > 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005763 encoding = "utf-16-be";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005764 }
5765 else {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005766 encoding = "utf-16";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005767 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005768
5769 pos = 0;
5770 while (pos < len) {
5771 Py_ssize_t repsize, moreunits;
5772
5773 if (kind == PyUnicode_2BYTE_KIND) {
5774 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
5775 &out, native_ordering);
5776 }
5777 else {
5778 assert(kind == PyUnicode_4BYTE_KIND);
5779 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
5780 &out, native_ordering);
5781 }
5782 if (pos == len)
5783 break;
5784
5785 rep = unicode_encode_call_errorhandler(
5786 errors, &errorHandler,
5787 encoding, "surrogates not allowed",
5788 str, &exc, pos, pos + 1, &pos);
5789 if (!rep)
5790 goto error;
5791
5792 if (PyBytes_Check(rep)) {
5793 repsize = PyBytes_GET_SIZE(rep);
5794 if (repsize & 1) {
5795 raise_encode_exception(&exc, encoding,
5796 str, pos - 1, pos,
5797 "surrogates not allowed");
5798 goto error;
5799 }
5800 moreunits = repsize / 2;
5801 }
5802 else {
5803 assert(PyUnicode_Check(rep));
5804 if (PyUnicode_READY(rep) < 0)
5805 goto error;
5806 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5807 if (!PyUnicode_IS_ASCII(rep)) {
5808 raise_encode_exception(&exc, encoding,
5809 str, pos - 1, pos,
5810 "surrogates not allowed");
5811 goto error;
5812 }
5813 }
5814
5815 /* two bytes are reserved for each surrogate */
5816 if (moreunits > 1) {
5817 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
5818 Py_ssize_t morebytes = 2 * (moreunits - 1);
5819 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5820 /* integer overflow */
5821 PyErr_NoMemory();
5822 goto error;
5823 }
5824 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5825 goto error;
5826 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
5827 }
5828
5829 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005830 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005831 out += moreunits;
5832 } else /* rep is unicode */ {
5833 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5834 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5835 &out, native_ordering);
5836 }
5837
5838 Py_CLEAR(rep);
5839 }
5840
5841 /* Cut back to size actually needed. This is necessary for, for example,
5842 encoding of a string containing isolated surrogates and the 'ignore' handler
5843 is used. */
5844 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5845 if (nsize != PyBytes_GET_SIZE(v))
5846 _PyBytes_Resize(&v, nsize);
5847 Py_XDECREF(errorHandler);
5848 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00005849 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005850 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005851 error:
5852 Py_XDECREF(rep);
5853 Py_XDECREF(errorHandler);
5854 Py_XDECREF(exc);
5855 Py_XDECREF(v);
5856 return NULL;
5857#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005858}
5859
Alexander Belopolsky40018472011-02-26 01:02:56 +00005860PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005861PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5862 Py_ssize_t size,
5863 const char *errors,
5864 int byteorder)
5865{
5866 PyObject *result;
5867 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5868 if (tmp == NULL)
5869 return NULL;
5870 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5871 Py_DECREF(tmp);
5872 return result;
5873}
5874
5875PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005876PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005877{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005878 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005879}
5880
5881/* --- Unicode Escape Codec ----------------------------------------------- */
5882
Fredrik Lundh06d12682001-01-24 07:59:11 +00005883static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005884
Alexander Belopolsky40018472011-02-26 01:02:56 +00005885PyObject *
5886PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005887 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005888 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005889{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005890 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005891 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005892 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005893 PyObject *errorHandler = NULL;
5894 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005895
Victor Stinner62ec3312016-09-06 17:04:34 -07005896 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005897 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07005898 }
5899 /* Escaped strings will always be longer than the resulting
5900 Unicode string, so we start with size here and then reduce the
5901 length after conversion to the true value.
5902 (but if the error callback returns a long replacement string
5903 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005904 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07005905 writer.min_length = size;
5906 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
5907 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005908 }
5909
Guido van Rossumd57fd912000-03-10 22:53:23 +00005910 end = s + size;
5911 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07005912 unsigned char c = (unsigned char) *s++;
5913 Py_UCS4 ch;
5914 int count;
5915 Py_ssize_t startinpos;
5916 Py_ssize_t endinpos;
5917 const char *message;
5918
5919#define WRITE_ASCII_CHAR(ch) \
5920 do { \
5921 assert(ch <= 127); \
5922 assert(writer.pos < writer.size); \
5923 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
5924 } while(0)
5925
5926#define WRITE_CHAR(ch) \
5927 do { \
5928 if (ch <= writer.maxchar) { \
5929 assert(writer.pos < writer.size); \
5930 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
5931 } \
5932 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
5933 goto onError; \
5934 } \
5935 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005936
5937 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07005938 if (c != '\\') {
5939 WRITE_CHAR(c);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005940 continue;
5941 }
5942
Victor Stinner62ec3312016-09-06 17:04:34 -07005943 startinpos = s - starts - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005944 /* \ - Escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07005945 if (s >= end) {
5946 message = "\\ at end of string";
5947 goto error;
5948 }
5949 c = (unsigned char) *s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005950
Victor Stinner62ec3312016-09-06 17:04:34 -07005951 assert(writer.pos < writer.size);
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005952 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005953
Benjamin Peterson29060642009-01-31 22:14:21 +00005954 /* \x escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07005955 case '\n': continue;
5956 case '\\': WRITE_ASCII_CHAR('\\'); continue;
5957 case '\'': WRITE_ASCII_CHAR('\''); continue;
5958 case '\"': WRITE_ASCII_CHAR('\"'); continue;
5959 case 'b': WRITE_ASCII_CHAR('\b'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005960 /* FF */
Victor Stinner62ec3312016-09-06 17:04:34 -07005961 case 'f': WRITE_ASCII_CHAR('\014'); continue;
5962 case 't': WRITE_ASCII_CHAR('\t'); continue;
5963 case 'n': WRITE_ASCII_CHAR('\n'); continue;
5964 case 'r': WRITE_ASCII_CHAR('\r'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005965 /* VT */
Victor Stinner62ec3312016-09-06 17:04:34 -07005966 case 'v': WRITE_ASCII_CHAR('\013'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005967 /* BEL, not classic C */
Victor Stinner62ec3312016-09-06 17:04:34 -07005968 case 'a': WRITE_ASCII_CHAR('\007'); continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005969
Benjamin Peterson29060642009-01-31 22:14:21 +00005970 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005971 case '0': case '1': case '2': case '3':
5972 case '4': case '5': case '6': case '7':
Victor Stinner62ec3312016-09-06 17:04:34 -07005973 ch = c - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005974 if (s < end && '0' <= *s && *s <= '7') {
Victor Stinner62ec3312016-09-06 17:04:34 -07005975 ch = (ch<<3) + *s++ - '0';
5976 if (s < end && '0' <= *s && *s <= '7') {
5977 ch = (ch<<3) + *s++ - '0';
5978 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005979 }
Victor Stinner62ec3312016-09-06 17:04:34 -07005980 WRITE_CHAR(ch);
5981 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005982
Benjamin Peterson29060642009-01-31 22:14:21 +00005983 /* hex escapes */
5984 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005985 case 'x':
Victor Stinner62ec3312016-09-06 17:04:34 -07005986 count = 2;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005987 message = "truncated \\xXX escape";
5988 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005989
Benjamin Peterson29060642009-01-31 22:14:21 +00005990 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005991 case 'u':
Victor Stinner62ec3312016-09-06 17:04:34 -07005992 count = 4;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005993 message = "truncated \\uXXXX escape";
5994 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005995
Benjamin Peterson29060642009-01-31 22:14:21 +00005996 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005997 case 'U':
Victor Stinner62ec3312016-09-06 17:04:34 -07005998 count = 8;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005999 message = "truncated \\UXXXXXXXX escape";
6000 hexescape:
Victor Stinner62ec3312016-09-06 17:04:34 -07006001 for (ch = 0; count && s < end; ++s, --count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006002 c = (unsigned char)*s;
Victor Stinner62ec3312016-09-06 17:04:34 -07006003 ch <<= 4;
6004 if (c >= '0' && c <= '9') {
6005 ch += c - '0';
6006 }
6007 else if (c >= 'a' && c <= 'f') {
6008 ch += c - ('a' - 10);
6009 }
6010 else if (c >= 'A' && c <= 'F') {
6011 ch += c - ('A' - 10);
6012 }
6013 else {
6014 break;
6015 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00006016 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006017 if (count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006018 goto error;
Victor Stinner62ec3312016-09-06 17:04:34 -07006019 }
6020
6021 /* when we get here, ch is a 32-bit unicode character */
6022 if (ch > MAX_UNICODE) {
6023 message = "illegal Unicode character";
6024 goto error;
6025 }
6026
6027 WRITE_CHAR(ch);
6028 continue;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006029
Benjamin Peterson29060642009-01-31 22:14:21 +00006030 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006031 case 'N':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006032 if (ucnhash_CAPI == NULL) {
6033 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006034 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6035 PyUnicodeData_CAPSULE_NAME, 1);
Victor Stinner62ec3312016-09-06 17:04:34 -07006036 if (ucnhash_CAPI == NULL) {
6037 PyErr_SetString(
6038 PyExc_UnicodeError,
6039 "\\N escapes not supported (can't load unicodedata module)"
6040 );
6041 goto onError;
6042 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006043 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006044
6045 message = "malformed \\N character escape";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006046 if (*s == '{') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006047 const char *start = ++s;
6048 size_t namelen;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006049 /* look for the closing brace */
Victor Stinner62ec3312016-09-06 17:04:34 -07006050 while (s < end && *s != '}')
Fredrik Lundhccc74732001-02-18 22:13:49 +00006051 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006052 namelen = s - start;
6053 if (namelen && s < end) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00006054 /* found a name. look it up in the unicode database */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006055 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006056 ch = 0xffffffff; /* in case 'getcode' messes up */
6057 if (namelen <= INT_MAX &&
6058 ucnhash_CAPI->getcode(NULL, start, (int)namelen,
6059 &ch, 0)) {
6060 assert(ch <= MAX_UNICODE);
6061 WRITE_CHAR(ch);
6062 continue;
6063 }
6064 message = "unknown Unicode character name";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006065 }
6066 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006067 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006068
6069 default:
R David Murray110b6fe2016-09-08 15:34:08 -04006070 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6071 "invalid escape sequence '\\%c'", c) < 0)
6072 goto onError;
Victor Stinner62ec3312016-09-06 17:04:34 -07006073 WRITE_ASCII_CHAR('\\');
6074 WRITE_CHAR(c);
6075 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006076 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006077
6078 error:
6079 endinpos = s-starts;
Victor Stinner62ec3312016-09-06 17:04:34 -07006080 writer.min_length = end - s + writer.pos;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006081 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02006082 errors, &errorHandler,
6083 "unicodeescape", message,
6084 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner62ec3312016-09-06 17:04:34 -07006085 &writer)) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006086 goto onError;
Victor Stinner62ec3312016-09-06 17:04:34 -07006087 }
6088 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0) {
6089 goto onError;
6090 }
6091
6092#undef WRITE_ASCII_CHAR
6093#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006094 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006095
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006096 Py_XDECREF(errorHandler);
6097 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006098 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00006099
Benjamin Peterson29060642009-01-31 22:14:21 +00006100 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006101 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006102 Py_XDECREF(errorHandler);
6103 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006104 return NULL;
6105}
6106
/* Return a bytes object holding the Unicode-Escape encoding of the
   Unicode object.  The result is not enclosed in quotes. */
6113
Alexander Belopolsky40018472011-02-26 01:02:56 +00006114PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006115PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006116{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006117 Py_ssize_t i, len;
Victor Stinner62ec3312016-09-06 17:04:34 -07006118 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006119 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006120 enum PyUnicode_Kind kind;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006121 void *data;
Victor Stinner62ec3312016-09-06 17:04:34 -07006122 Py_ssize_t expandsize;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006123
Ezio Melottie7f90372012-10-05 03:33:31 +03006124 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00006125 escape.
6126
Ezio Melottie7f90372012-10-05 03:33:31 +03006127 For UCS1 strings it's '\xxx', 4 bytes per source character.
6128 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6129 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00006130 */
6131
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006132 if (!PyUnicode_Check(unicode)) {
6133 PyErr_BadArgument();
6134 return NULL;
6135 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006136 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006137 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006138 }
Victor Stinner358af132015-10-12 22:36:57 +02006139
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006140 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006141 if (len == 0) {
6142 return PyBytes_FromStringAndSize(NULL, 0);
6143 }
6144
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006145 kind = PyUnicode_KIND(unicode);
6146 data = PyUnicode_DATA(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006147 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6148 bytes, and 1 byte characters 4. */
6149 expandsize = kind * 2 + 2;
6150 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize) {
6151 return PyErr_NoMemory();
6152 }
6153 repr = PyBytes_FromStringAndSize(NULL, 2 + expandsize * len + 1);
6154 if (repr == NULL) {
6155 return NULL;
6156 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006157
Victor Stinner62ec3312016-09-06 17:04:34 -07006158 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006159 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006160 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006161
Victor Stinner62ec3312016-09-06 17:04:34 -07006162 /* U+0000-U+00ff range */
6163 if (ch < 0x100) {
6164 if (ch >= ' ' && ch < 127) {
6165 if (ch != '\\') {
6166 /* Copy printable US ASCII as-is */
6167 *p++ = (char) ch;
6168 }
6169 /* Escape backslashes */
6170 else {
6171 *p++ = '\\';
6172 *p++ = '\\';
6173 }
6174 }
Victor Stinner358af132015-10-12 22:36:57 +02006175
Victor Stinner62ec3312016-09-06 17:04:34 -07006176 /* Map special whitespace to '\t', \n', '\r' */
6177 else if (ch == '\t') {
6178 *p++ = '\\';
6179 *p++ = 't';
6180 }
6181 else if (ch == '\n') {
6182 *p++ = '\\';
6183 *p++ = 'n';
6184 }
6185 else if (ch == '\r') {
6186 *p++ = '\\';
6187 *p++ = 'r';
6188 }
6189
6190 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6191 else {
6192 *p++ = '\\';
6193 *p++ = 'x';
6194 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6195 *p++ = Py_hexdigits[ch & 0x000F];
6196 }
Tim Petersced69f82003-09-16 20:30:58 +00006197 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006198 /* U+0000-U+00ff range: Map 16-bit characters to '\uHHHH' */
6199 else if (ch < 0x10000) {
6200 /* U+0100-U+ffff */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006201 *p++ = '\\';
6202 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006203 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6204 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6205 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6206 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006207 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006208 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6209 else {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006210
Victor Stinner62ec3312016-09-06 17:04:34 -07006211 /* Make sure that the first two digits are zero */
6212 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006213 *p++ = '\\';
Victor Stinner62ec3312016-09-06 17:04:34 -07006214 *p++ = 'U';
6215 *p++ = '0';
6216 *p++ = '0';
6217 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6218 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6219 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6220 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6221 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6222 *p++ = Py_hexdigits[ch & 0x0000000F];
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006223 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006224 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006225
Victor Stinner62ec3312016-09-06 17:04:34 -07006226 assert(p - PyBytes_AS_STRING(repr) > 0);
6227 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6228 return NULL;
6229 }
6230 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006231}
6232
Alexander Belopolsky40018472011-02-26 01:02:56 +00006233PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006234PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6235 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006236{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006237 PyObject *result;
6238 PyObject *tmp = PyUnicode_FromUnicode(s, size);
Victor Stinner62ec3312016-09-06 17:04:34 -07006239 if (tmp == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006240 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006241 }
6242
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006243 result = PyUnicode_AsUnicodeEscapeString(tmp);
6244 Py_DECREF(tmp);
6245 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006246}
6247
6248/* --- Raw Unicode Escape Codec ------------------------------------------- */
6249
Alexander Belopolsky40018472011-02-26 01:02:56 +00006250PyObject *
6251PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006252 Py_ssize_t size,
6253 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006254{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006255 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006256 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006257 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006258 PyObject *errorHandler = NULL;
6259 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006260
Victor Stinner62ec3312016-09-06 17:04:34 -07006261 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006262 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006263 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006264
Guido van Rossumd57fd912000-03-10 22:53:23 +00006265 /* Escaped strings will always be longer than the resulting
6266 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006267 length after conversion to the true value. (But decoding error
6268 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006269 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07006270 writer.min_length = size;
6271 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6272 goto onError;
6273 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006274
Guido van Rossumd57fd912000-03-10 22:53:23 +00006275 end = s + size;
6276 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006277 unsigned char c = (unsigned char) *s++;
6278 Py_UCS4 ch;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006279 int count;
Victor Stinner62ec3312016-09-06 17:04:34 -07006280 Py_ssize_t startinpos;
6281 Py_ssize_t endinpos;
6282 const char *message;
6283
6284#define WRITE_CHAR(ch) \
6285 do { \
6286 if (ch <= writer.maxchar) { \
6287 assert(writer.pos < writer.size); \
6288 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6289 } \
6290 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6291 goto onError; \
6292 } \
6293 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006294
Benjamin Peterson29060642009-01-31 22:14:21 +00006295 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006296 if (c != '\\' || s >= end) {
6297 WRITE_CHAR(c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006298 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006299 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006300
Victor Stinner62ec3312016-09-06 17:04:34 -07006301 c = (unsigned char) *s++;
6302 if (c == 'u') {
6303 count = 4;
6304 message = "truncated \\uXXXX escape";
Benjamin Peterson29060642009-01-31 22:14:21 +00006305 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006306 else if (c == 'U') {
6307 count = 8;
6308 message = "truncated \\UXXXXXXXX escape";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006309 }
6310 else {
Victor Stinner62ec3312016-09-06 17:04:34 -07006311 assert(writer.pos < writer.size);
6312 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6313 WRITE_CHAR(c);
6314 continue;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006315 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006316 startinpos = s - starts - 2;
6317
6318 /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6319 for (ch = 0; count && s < end; ++s, --count) {
6320 c = (unsigned char)*s;
6321 ch <<= 4;
6322 if (c >= '0' && c <= '9') {
6323 ch += c - '0';
6324 }
6325 else if (c >= 'a' && c <= 'f') {
6326 ch += c - ('a' - 10);
6327 }
6328 else if (c >= 'A' && c <= 'F') {
6329 ch += c - ('A' - 10);
6330 }
6331 else {
6332 break;
6333 }
6334 }
6335 if (!count) {
6336 if (ch <= MAX_UNICODE) {
6337 WRITE_CHAR(ch);
6338 continue;
6339 }
6340 message = "\\Uxxxxxxxx out of range";
6341 }
6342
6343 endinpos = s-starts;
6344 writer.min_length = end - s + writer.pos;
6345 if (unicode_decode_call_errorhandler_writer(
6346 errors, &errorHandler,
6347 "rawunicodeescape", message,
6348 &starts, &end, &startinpos, &endinpos, &exc, &s,
6349 &writer)) {
6350 goto onError;
6351 }
6352 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0) {
6353 goto onError;
6354 }
6355
6356#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006357 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006358 Py_XDECREF(errorHandler);
6359 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006360 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006361
Benjamin Peterson29060642009-01-31 22:14:21 +00006362 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006363 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006364 Py_XDECREF(errorHandler);
6365 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006366 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006367
Guido van Rossumd57fd912000-03-10 22:53:23 +00006368}
6369
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006370
Alexander Belopolsky40018472011-02-26 01:02:56 +00006371PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006372PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006373{
Victor Stinner62ec3312016-09-06 17:04:34 -07006374 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006375 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006376 Py_ssize_t expandsize, pos;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006377 int kind;
6378 void *data;
6379 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006380
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006381 if (!PyUnicode_Check(unicode)) {
6382 PyErr_BadArgument();
6383 return NULL;
6384 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006385 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006386 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006387 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006388 kind = PyUnicode_KIND(unicode);
6389 data = PyUnicode_DATA(unicode);
6390 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006391 if (kind == PyUnicode_1BYTE_KIND) {
6392 return PyBytes_FromStringAndSize(data, len);
6393 }
Victor Stinner0e368262011-11-10 20:12:49 +01006394
Victor Stinner62ec3312016-09-06 17:04:34 -07006395 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6396 bytes, and 1 byte characters 4. */
6397 expandsize = kind * 2 + 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006398
Victor Stinner62ec3312016-09-06 17:04:34 -07006399 if (len > PY_SSIZE_T_MAX / expandsize) {
6400 return PyErr_NoMemory();
6401 }
6402 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6403 if (repr == NULL) {
6404 return NULL;
6405 }
6406 if (len == 0) {
6407 return repr;
6408 }
6409
6410 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006411 for (pos = 0; pos < len; pos++) {
6412 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Victor Stinner358af132015-10-12 22:36:57 +02006413
Victor Stinner62ec3312016-09-06 17:04:34 -07006414 /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6415 if (ch < 0x100) {
6416 *p++ = (char) ch;
Tim Petersced69f82003-09-16 20:30:58 +00006417 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006418 /* U+0000-U+00ff range: Map 16-bit characters to '\uHHHH' */
6419 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006420 *p++ = '\\';
6421 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006422 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6423 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6424 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6425 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006426 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006427 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6428 else {
6429 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6430 *p++ = '\\';
6431 *p++ = 'U';
6432 *p++ = '0';
6433 *p++ = '0';
6434 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6435 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6436 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6437 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6438 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6439 *p++ = Py_hexdigits[ch & 15];
6440 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006441 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006442
Victor Stinner62ec3312016-09-06 17:04:34 -07006443 assert(p > PyBytes_AS_STRING(repr));
6444 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6445 return NULL;
6446 }
6447 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006448}
6449
Alexander Belopolsky40018472011-02-26 01:02:56 +00006450PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006451PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6452 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006453{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006454 PyObject *result;
6455 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6456 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006457 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006458 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6459 Py_DECREF(tmp);
6460 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006461}
6462
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006463/* --- Unicode Internal Codec ------------------------------------------- */
6464
Alexander Belopolsky40018472011-02-26 01:02:56 +00006465PyObject *
6466_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006467 Py_ssize_t size,
6468 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006469{
6470 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006471 Py_ssize_t startinpos;
6472 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006473 _PyUnicodeWriter writer;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006474 const char *end;
6475 const char *reason;
6476 PyObject *errorHandler = NULL;
6477 PyObject *exc = NULL;
6478
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006479 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006480 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006481 1))
6482 return NULL;
6483
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006484 if (size == 0)
6485 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006486
Victor Stinner8f674cc2013-04-17 23:02:17 +02006487 _PyUnicodeWriter_Init(&writer);
6488 if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) {
6489 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00006490 goto onError;
Victor Stinner8f674cc2013-04-17 23:02:17 +02006491 }
6492 writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006493
Victor Stinner8f674cc2013-04-17 23:02:17 +02006494 end = s + size;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006495 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006496 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006497 Py_UCS4 ch;
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006498 if (end - s < Py_UNICODE_SIZE) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006499 endinpos = end-starts;
6500 reason = "truncated input";
6501 goto error;
6502 }
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006503 /* We copy the raw representation one byte at a time because the
6504 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006505 ((char *) &uch)[0] = s[0];
6506 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006507#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006508 ((char *) &uch)[2] = s[2];
6509 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006510#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006511 ch = uch;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006512#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006513 /* We have to sanity check the raw data, otherwise doom looms for
6514 some malformed UCS-4 data. */
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006515 if (ch > 0x10ffff) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006516 endinpos = s - starts + Py_UNICODE_SIZE;
6517 reason = "illegal code point (> 0x10FFFF)";
6518 goto error;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006519 }
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006520#endif
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006521 s += Py_UNICODE_SIZE;
6522#ifndef Py_UNICODE_WIDE
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006523 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006524 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006525 Py_UNICODE uch2;
6526 ((char *) &uch2)[0] = s[0];
6527 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006528 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006529 {
Victor Stinner551ac952011-11-29 22:58:13 +01006530 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006531 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006532 }
6533 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006534#endif
6535
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006536 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006537 goto onError;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006538 continue;
6539
6540 error:
6541 startinpos = s - starts;
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006542 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006543 errors, &errorHandler,
6544 "unicode_internal", reason,
6545 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006546 &writer))
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006547 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006548 }
6549
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006550 Py_XDECREF(errorHandler);
6551 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006552 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006553
Benjamin Peterson29060642009-01-31 22:14:21 +00006554 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006555 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006556 Py_XDECREF(errorHandler);
6557 Py_XDECREF(exc);
6558 return NULL;
6559}
6560
Guido van Rossumd57fd912000-03-10 22:53:23 +00006561/* --- Latin-1 Codec ------------------------------------------------------ */
6562
Alexander Belopolsky40018472011-02-26 01:02:56 +00006563PyObject *
6564PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006565 Py_ssize_t size,
6566 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006567{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006568 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006569 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006570}
6571
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006572/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006573static void
6574make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006575 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006576 PyObject *unicode,
6577 Py_ssize_t startpos, Py_ssize_t endpos,
6578 const char *reason)
6579{
6580 if (*exceptionObject == NULL) {
6581 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006582 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006583 encoding, unicode, startpos, endpos, reason);
6584 }
6585 else {
6586 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6587 goto onError;
6588 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6589 goto onError;
6590 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6591 goto onError;
6592 return;
6593 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006594 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006595 }
6596}
6597
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006598/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006599static void
6600raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006601 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006602 PyObject *unicode,
6603 Py_ssize_t startpos, Py_ssize_t endpos,
6604 const char *reason)
6605{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006606 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006607 encoding, unicode, startpos, endpos, reason);
6608 if (*exceptionObject != NULL)
6609 PyCodec_StrictErrors(*exceptionObject);
6610}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006611
6612/* error handling callback helper:
6613 build arguments, call the callback and check the arguments,
6614 put the result into newpos and return the replacement string, which
6615 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006616static PyObject *
6617unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006618 PyObject **errorHandler,
6619 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006620 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006621 Py_ssize_t startpos, Py_ssize_t endpos,
6622 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006623{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02006624 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006625 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006626 PyObject *restuple;
6627 PyObject *resunicode;
6628
6629 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006630 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006631 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006632 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006633 }
6634
Benjamin Petersonbac79492012-01-14 13:34:47 -05006635 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006636 return NULL;
6637 len = PyUnicode_GET_LENGTH(unicode);
6638
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006639 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006640 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006641 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006642 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006643
6644 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006645 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006646 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006647 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006648 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006649 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006650 Py_DECREF(restuple);
6651 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006652 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006653 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006654 &resunicode, newpos)) {
6655 Py_DECREF(restuple);
6656 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006657 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006658 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6659 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6660 Py_DECREF(restuple);
6661 return NULL;
6662 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006663 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006664 *newpos = len + *newpos;
6665 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02006666 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006667 Py_DECREF(restuple);
6668 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006669 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006670 Py_INCREF(resunicode);
6671 Py_DECREF(restuple);
6672 return resunicode;
6673}
6674
Alexander Belopolsky40018472011-02-26 01:02:56 +00006675static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006676unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006677 const char *errors,
Victor Stinner0030cd52015-09-24 14:45:00 +02006678 const Py_UCS4 limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006679{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006680 /* input state */
6681 Py_ssize_t pos=0, size;
6682 int kind;
6683 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006684 /* pointer into the output */
6685 char *str;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006686 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6687 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Victor Stinner50149202015-09-22 00:26:54 +02006688 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006689 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02006690 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006691 PyObject *rep = NULL;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006692 /* output object */
6693 _PyBytesWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006694
Benjamin Petersonbac79492012-01-14 13:34:47 -05006695 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006696 return NULL;
6697 size = PyUnicode_GET_LENGTH(unicode);
6698 kind = PyUnicode_KIND(unicode);
6699 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006700 /* allocate enough for a simple encoding without
6701 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006702 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006703 return PyBytes_FromStringAndSize(NULL, 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006704
6705 _PyBytesWriter_Init(&writer);
6706 str = _PyBytesWriter_Alloc(&writer, size);
6707 if (str == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006708 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006709
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006710 while (pos < size) {
Victor Stinner0030cd52015-09-24 14:45:00 +02006711 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006712
Benjamin Peterson29060642009-01-31 22:14:21 +00006713 /* can we encode this? */
Victor Stinner0030cd52015-09-24 14:45:00 +02006714 if (ch < limit) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006715 /* no overflow check, because we know that the space is enough */
Victor Stinner0030cd52015-09-24 14:45:00 +02006716 *str++ = (char)ch;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006717 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006718 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006719 else {
Victor Stinner6bd525b2015-10-09 13:10:05 +02006720 Py_ssize_t newpos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006721 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006722 Py_ssize_t collstart = pos;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006723 Py_ssize_t collend = collstart + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006724 /* find all unecodable characters */
Victor Stinner50149202015-09-22 00:26:54 +02006725
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006726 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006727 ++collend;
Victor Stinner50149202015-09-22 00:26:54 +02006728
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006729 /* Only overallocate the buffer if it's not the last write */
6730 writer.overallocate = (collend < size);
6731
Benjamin Peterson29060642009-01-31 22:14:21 +00006732 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02006733 if (error_handler == _Py_ERROR_UNKNOWN)
6734 error_handler = get_error_handler(errors);
6735
6736 switch (error_handler) {
6737 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006738 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006739 goto onError;
Victor Stinner50149202015-09-22 00:26:54 +02006740
6741 case _Py_ERROR_REPLACE:
Victor Stinner01ada392015-10-01 21:54:51 +02006742 memset(str, '?', collend - collstart);
6743 str += (collend - collstart);
Victor Stinner0030cd52015-09-24 14:45:00 +02006744 /* fall through ignore error handler */
Victor Stinner50149202015-09-22 00:26:54 +02006745 case _Py_ERROR_IGNORE:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006746 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006747 break;
Victor Stinner50149202015-09-22 00:26:54 +02006748
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006749 case _Py_ERROR_BACKSLASHREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006750 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006751 writer.min_size -= (collend - collstart);
6752 str = backslashreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006753 unicode, collstart, collend);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006754 if (str == NULL)
6755 goto onError;
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006756 pos = collend;
6757 break;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006758
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006759 case _Py_ERROR_XMLCHARREFREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006760 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006761 writer.min_size -= (collend - collstart);
6762 str = xmlcharrefreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006763 unicode, collstart, collend);
6764 if (str == NULL)
6765 goto onError;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006766 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006767 break;
Victor Stinner50149202015-09-22 00:26:54 +02006768
Victor Stinnerc3713e92015-09-29 12:32:13 +02006769 case _Py_ERROR_SURROGATEESCAPE:
6770 for (i = collstart; i < collend; ++i) {
6771 ch = PyUnicode_READ(kind, data, i);
6772 if (ch < 0xdc80 || 0xdcff < ch) {
6773 /* Not a UTF-8b surrogate */
6774 break;
6775 }
6776 *str++ = (char)(ch - 0xdc00);
6777 ++pos;
6778 }
6779 if (i >= collend)
6780 break;
6781 collstart = pos;
6782 assert(collstart != collend);
6783 /* fallback to general error handling */
6784
Benjamin Peterson29060642009-01-31 22:14:21 +00006785 default:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006786 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
6787 encoding, reason, unicode, &exc,
6788 collstart, collend, &newpos);
6789 if (rep == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006790 goto onError;
Victor Stinner0030cd52015-09-24 14:45:00 +02006791
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006792 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006793 writer.min_size -= 1;
6794
Victor Stinner6bd525b2015-10-09 13:10:05 +02006795 if (PyBytes_Check(rep)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006796 /* Directly copy bytes result to output. */
Victor Stinnerce179bf2015-10-09 12:57:22 +02006797 str = _PyBytesWriter_WriteBytes(&writer, str,
Victor Stinner6bd525b2015-10-09 13:10:05 +02006798 PyBytes_AS_STRING(rep),
6799 PyBytes_GET_SIZE(rep));
Victor Stinnerad771582015-10-09 12:38:53 +02006800 if (str == NULL)
6801 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006802 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02006803 else {
6804 assert(PyUnicode_Check(rep));
Victor Stinner0030cd52015-09-24 14:45:00 +02006805
Victor Stinner6bd525b2015-10-09 13:10:05 +02006806 if (PyUnicode_READY(rep) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006807 goto onError;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006808
6809 if (PyUnicode_IS_ASCII(rep)) {
6810 /* Fast path: all characters are smaller than limit */
6811 assert(limit >= 128);
6812 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6813 str = _PyBytesWriter_WriteBytes(&writer, str,
6814 PyUnicode_DATA(rep),
6815 PyUnicode_GET_LENGTH(rep));
Benjamin Peterson29060642009-01-31 22:14:21 +00006816 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02006817 else {
6818 Py_ssize_t repsize = PyUnicode_GET_LENGTH(rep);
6819
6820 str = _PyBytesWriter_Prepare(&writer, str, repsize);
6821 if (str == NULL)
6822 goto onError;
6823
6824 /* check if there is anything unencodable in the
6825 replacement and copy it to the output */
6826 for (i = 0; repsize-->0; ++i, ++str) {
6827 ch = PyUnicode_READ_CHAR(rep, i);
6828 if (ch >= limit) {
6829 raise_encode_exception(&exc, encoding, unicode,
6830 pos, pos+1, reason);
6831 goto onError;
6832 }
6833 *str = (char)ch;
6834 }
6835 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006836 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006837 pos = newpos;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006838 Py_CLEAR(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006839 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006840
6841 /* If overallocation was disabled, ensure that it was the last
6842 write. Otherwise, we missed an optimization */
6843 assert(writer.overallocate || pos == size);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006844 }
6845 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006846
Victor Stinner50149202015-09-22 00:26:54 +02006847 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006848 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006849 return _PyBytesWriter_Finish(&writer, str);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006850
6851 onError:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006852 Py_XDECREF(rep);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006853 _PyBytesWriter_Dealloc(&writer);
Victor Stinner50149202015-09-22 00:26:54 +02006854 Py_XDECREF(error_handler_obj);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006855 Py_XDECREF(exc);
6856 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006857}
6858
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006859/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006860PyObject *
6861PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006862 Py_ssize_t size,
6863 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006864{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006865 PyObject *result;
6866 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6867 if (unicode == NULL)
6868 return NULL;
6869 result = unicode_encode_ucs1(unicode, errors, 256);
6870 Py_DECREF(unicode);
6871 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006872}
6873
Alexander Belopolsky40018472011-02-26 01:02:56 +00006874PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006875_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006876{
6877 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006878 PyErr_BadArgument();
6879 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006880 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006881 if (PyUnicode_READY(unicode) == -1)
6882 return NULL;
6883 /* Fast path: if it is a one-byte string, construct
6884 bytes object directly. */
6885 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6886 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6887 PyUnicode_GET_LENGTH(unicode));
6888 /* Non-Latin-1 characters present. Defer to above function to
6889 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006890 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006891}
6892
6893PyObject*
6894PyUnicode_AsLatin1String(PyObject *unicode)
6895{
6896 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006897}
6898
6899/* --- 7-bit ASCII Codec -------------------------------------------------- */
6900
Alexander Belopolsky40018472011-02-26 01:02:56 +00006901PyObject *
6902PyUnicode_DecodeASCII(const char *s,
6903 Py_ssize_t size,
6904 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006905{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006906 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006907 _PyUnicodeWriter writer;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006908 int kind;
6909 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006910 Py_ssize_t startinpos;
6911 Py_ssize_t endinpos;
6912 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006913 const char *e;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006914 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006915 PyObject *exc = NULL;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006916 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Tim Petersced69f82003-09-16 20:30:58 +00006917
Guido van Rossumd57fd912000-03-10 22:53:23 +00006918 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02006919 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006920
Guido van Rossumd57fd912000-03-10 22:53:23 +00006921 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006922 if (size == 1 && (unsigned char)s[0] < 128)
6923 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006924
Victor Stinner8f674cc2013-04-17 23:02:17 +02006925 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02006926 writer.min_length = size;
6927 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
Victor Stinner8f674cc2013-04-17 23:02:17 +02006928 return NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006929
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006930 e = s + size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006931 data = writer.data;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006932 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006933 writer.pos = outpos;
6934 if (writer.pos == size)
6935 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006936
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006937 s += writer.pos;
6938 kind = writer.kind;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006939 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02006940 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00006941 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006942 PyUnicode_WRITE(kind, data, writer.pos, c);
6943 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00006944 ++s;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006945 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +00006946 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02006947
6948 /* byte outsize range 0x00..0x7f: call the error handler */
6949
6950 if (error_handler == _Py_ERROR_UNKNOWN)
6951 error_handler = get_error_handler(errors);
6952
6953 switch (error_handler)
6954 {
6955 case _Py_ERROR_REPLACE:
6956 case _Py_ERROR_SURROGATEESCAPE:
6957 /* Fast-path: the error handler only writes one character,
Victor Stinnerca9381e2015-09-22 00:58:32 +02006958 but we may switch to UCS2 at the first write */
6959 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
6960 goto onError;
6961 kind = writer.kind;
6962 data = writer.data;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006963
6964 if (error_handler == _Py_ERROR_REPLACE)
6965 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
6966 else
6967 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
6968 writer.pos++;
6969 ++s;
6970 break;
6971
6972 case _Py_ERROR_IGNORE:
6973 ++s;
6974 break;
6975
6976 default:
Benjamin Peterson29060642009-01-31 22:14:21 +00006977 startinpos = s-starts;
6978 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006979 if (unicode_decode_call_errorhandler_writer(
Victor Stinnerf96418d2015-09-21 23:06:27 +02006980 errors, &error_handler_obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00006981 "ascii", "ordinal not in range(128)",
6982 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006983 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006984 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006985 kind = writer.kind;
6986 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00006987 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006988 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02006989 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006990 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006991 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006992
Benjamin Peterson29060642009-01-31 22:14:21 +00006993 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006994 _PyUnicodeWriter_Dealloc(&writer);
Victor Stinnerf96418d2015-09-21 23:06:27 +02006995 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006996 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006997 return NULL;
6998}
6999
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007000/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007001PyObject *
7002PyUnicode_EncodeASCII(const Py_UNICODE *p,
7003 Py_ssize_t size,
7004 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007005{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007006 PyObject *result;
7007 PyObject *unicode = PyUnicode_FromUnicode(p, size);
7008 if (unicode == NULL)
7009 return NULL;
7010 result = unicode_encode_ucs1(unicode, errors, 128);
7011 Py_DECREF(unicode);
7012 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007013}
7014
Alexander Belopolsky40018472011-02-26 01:02:56 +00007015PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007016_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007017{
7018 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007019 PyErr_BadArgument();
7020 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007021 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007022 if (PyUnicode_READY(unicode) == -1)
7023 return NULL;
7024 /* Fast path: if it is an ASCII-only string, construct bytes object
7025 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02007026 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007027 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7028 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007029 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007030}
7031
7032PyObject *
7033PyUnicode_AsASCIIString(PyObject *unicode)
7034{
7035 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007036}
7037
Steve Dowercc16be82016-09-08 10:35:16 -07007038#ifdef MS_WINDOWS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007039
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007040/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007041
/* NEED_RETRY is defined when int is narrower than size_t.  The code page
   decoder below checks it to decide whether an input longer than INT_MAX
   has to be processed in several chunks. */
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00007042#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007043#define NEED_RETRY
7044#endif
7045
/* Fallback definition of WC_ERR_INVALID_CHARS for builds where the
   included headers do not provide it; the flag is used by
   encode_code_page_flags() for CP_UTF8. */
Victor Stinner3a50e702011-10-18 21:21:00 +02007046#ifndef WC_ERR_INVALID_CHARS
7047# define WC_ERR_INVALID_CHARS 0x0080
7048#endif
7049
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007050static const char*
Victor Stinner3a50e702011-10-18 21:21:00 +02007051code_page_name(UINT code_page, PyObject **obj)
7052{
7053 *obj = NULL;
7054 if (code_page == CP_ACP)
7055 return "mbcs";
7056 if (code_page == CP_UTF7)
7057 return "CP_UTF7";
7058 if (code_page == CP_UTF8)
7059 return "CP_UTF8";
7060
7061 *obj = PyBytes_FromFormat("cp%u", code_page);
7062 if (*obj == NULL)
7063 return NULL;
7064 return PyBytes_AS_STRING(*obj);
7065}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007066
Victor Stinner3a50e702011-10-18 21:21:00 +02007067static DWORD
7068decode_code_page_flags(UINT code_page)
7069{
7070 if (code_page == CP_UTF7) {
7071 /* The CP_UTF7 decoder only supports flags=0 */
7072 return 0;
7073 }
7074 else
7075 return MB_ERR_INVALID_CHARS;
7076}
7077
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007078/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007079 * Decode a byte string from a Windows code page into unicode object in strict
7080 * mode.
7081 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007082 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7083 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007084 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007085static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007086decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007087 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02007088 const char *in,
7089 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007090{
Victor Stinner3a50e702011-10-18 21:21:00 +02007091 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01007092 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007093 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007094
7095 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007096 assert(insize > 0);
7097 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
7098 if (outsize <= 0)
7099 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007100
7101 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007102 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01007103 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007104 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00007105 if (*v == NULL)
7106 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007107 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007108 }
7109 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007110 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007111 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01007112 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007113 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007114 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007115 }
7116
7117 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007118 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7119 if (outsize <= 0)
7120 goto error;
7121 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007122
Victor Stinner3a50e702011-10-18 21:21:00 +02007123error:
7124 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7125 return -2;
7126 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007127 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007128}
7129
Victor Stinner3a50e702011-10-18 21:21:00 +02007130/*
7131 * Decode a byte string from a code page into unicode object with an error
7132 * handler.
7133 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007134 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02007135 * UnicodeDecodeError exception and returns -1 on error.
7136 */
7137static int
7138decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007139 PyObject **v,
7140 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007141 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02007142{
7143 const char *startin = in;
7144 const char *endin = in + size;
7145 const DWORD flags = decode_code_page_flags(code_page);
7146 /* Ideally, we should get reason from FormatMessage. This is the Windows
7147 2000 English version of the message. */
7148 const char *reason = "No mapping for the Unicode character exists "
7149 "in the target code page.";
7150 /* each step cannot decode more than 1 character, but a character can be
7151 represented as a surrogate pair */
7152 wchar_t buffer[2], *startout, *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02007153 int insize;
7154 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007155 PyObject *errorHandler = NULL;
7156 PyObject *exc = NULL;
7157 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007158 const char *encoding;
Victor Stinner3a50e702011-10-18 21:21:00 +02007159 DWORD err;
7160 int ret = -1;
7161
7162 assert(size > 0);
7163
7164 encoding = code_page_name(code_page, &encoding_obj);
7165 if (encoding == NULL)
7166 return -1;
7167
Victor Stinner7d00cc12014-03-17 23:08:06 +01007168 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007169 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7170 UnicodeDecodeError. */
7171 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7172 if (exc != NULL) {
7173 PyCodec_StrictErrors(exc);
7174 Py_CLEAR(exc);
7175 }
7176 goto error;
7177 }
7178
7179 if (*v == NULL) {
7180 /* Create unicode object */
7181 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7182 PyErr_NoMemory();
7183 goto error;
7184 }
Victor Stinnerab595942011-12-17 04:59:06 +01007185 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007186 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02007187 if (*v == NULL)
7188 goto error;
7189 startout = PyUnicode_AS_UNICODE(*v);
7190 }
7191 else {
7192 /* Extend unicode object */
7193 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7194 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7195 PyErr_NoMemory();
7196 goto error;
7197 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007198 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007199 goto error;
7200 startout = PyUnicode_AS_UNICODE(*v) + n;
7201 }
7202
7203 /* Decode the byte string character per character */
7204 out = startout;
7205 while (in < endin)
7206 {
7207 /* Decode a character */
7208 insize = 1;
7209 do
7210 {
7211 outsize = MultiByteToWideChar(code_page, flags,
7212 in, insize,
7213 buffer, Py_ARRAY_LENGTH(buffer));
7214 if (outsize > 0)
7215 break;
7216 err = GetLastError();
7217 if (err != ERROR_NO_UNICODE_TRANSLATION
7218 && err != ERROR_INSUFFICIENT_BUFFER)
7219 {
7220 PyErr_SetFromWindowsErr(0);
7221 goto error;
7222 }
7223 insize++;
7224 }
7225 /* 4=maximum length of a UTF-8 sequence */
7226 while (insize <= 4 && (in + insize) <= endin);
7227
7228 if (outsize <= 0) {
7229 Py_ssize_t startinpos, endinpos, outpos;
7230
Victor Stinner7d00cc12014-03-17 23:08:06 +01007231 /* last character in partial decode? */
7232 if (in + insize >= endin && !final)
7233 break;
7234
Victor Stinner3a50e702011-10-18 21:21:00 +02007235 startinpos = in - startin;
7236 endinpos = startinpos + 1;
7237 outpos = out - PyUnicode_AS_UNICODE(*v);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007238 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007239 errors, &errorHandler,
7240 encoding, reason,
7241 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01007242 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007243 {
7244 goto error;
7245 }
Victor Stinner596a6c42011-11-09 00:02:18 +01007246 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007247 }
7248 else {
7249 in += insize;
7250 memcpy(out, buffer, outsize * sizeof(wchar_t));
7251 out += outsize;
7252 }
7253 }
7254
7255 /* write a NUL character at the end */
7256 *out = 0;
7257
7258 /* Extend unicode object */
7259 outsize = out - startout;
7260 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01007261 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007262 goto error;
Victor Stinnere1f17c62014-07-25 14:03:03 +02007263 /* (in - startin) <= size and size is an int */
7264 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02007265
7266error:
7267 Py_XDECREF(encoding_obj);
7268 Py_XDECREF(errorHandler);
7269 Py_XDECREF(exc);
7270 return ret;
7271}
7272
Victor Stinner3a50e702011-10-18 21:21:00 +02007273static PyObject *
7274decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007275 const char *s, Py_ssize_t size,
7276 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007277{
Victor Stinner76a31a62011-11-04 00:05:13 +01007278 PyObject *v = NULL;
7279 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007280
Victor Stinner3a50e702011-10-18 21:21:00 +02007281 if (code_page < 0) {
7282 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7283 return NULL;
7284 }
7285
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007286 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007287 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007288
Victor Stinner76a31a62011-11-04 00:05:13 +01007289 do
7290 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007291#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007292 if (size > INT_MAX) {
7293 chunk_size = INT_MAX;
7294 final = 0;
7295 done = 0;
7296 }
7297 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007298#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007299 {
7300 chunk_size = (int)size;
7301 final = (consumed == NULL);
7302 done = 1;
7303 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007304
Victor Stinner76a31a62011-11-04 00:05:13 +01007305 if (chunk_size == 0 && done) {
7306 if (v != NULL)
7307 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007308 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007309 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007310
Victor Stinner76a31a62011-11-04 00:05:13 +01007311 converted = decode_code_page_strict(code_page, &v,
7312 s, chunk_size);
7313 if (converted == -2)
7314 converted = decode_code_page_errors(code_page, &v,
7315 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007316 errors, final);
7317 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007318
7319 if (converted < 0) {
7320 Py_XDECREF(v);
7321 return NULL;
7322 }
7323
7324 if (consumed)
7325 *consumed += converted;
7326
7327 s += converted;
7328 size -= converted;
7329 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007330
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007331 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007332}
7333
Alexander Belopolsky40018472011-02-26 01:02:56 +00007334PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007335PyUnicode_DecodeCodePageStateful(int code_page,
7336 const char *s,
7337 Py_ssize_t size,
7338 const char *errors,
7339 Py_ssize_t *consumed)
7340{
7341 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7342}
7343
7344PyObject *
7345PyUnicode_DecodeMBCSStateful(const char *s,
7346 Py_ssize_t size,
7347 const char *errors,
7348 Py_ssize_t *consumed)
7349{
7350 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7351}
7352
7353PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007354PyUnicode_DecodeMBCS(const char *s,
7355 Py_ssize_t size,
7356 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007357{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007358 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7359}
7360
Victor Stinner3a50e702011-10-18 21:21:00 +02007361static DWORD
7362encode_code_page_flags(UINT code_page, const char *errors)
7363{
7364 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007365 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007366 }
7367 else if (code_page == CP_UTF7) {
7368 /* CP_UTF7 only supports flags=0 */
7369 return 0;
7370 }
7371 else {
7372 if (errors != NULL && strcmp(errors, "replace") == 0)
7373 return 0;
7374 else
7375 return WC_NO_BEST_FIT_CHARS;
7376 }
7377}
7378
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007379/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007380 * Encode a Unicode string to a Windows code page into a byte string in strict
7381 * mode.
7382 *
7383 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007384 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007385 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007386static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007387encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007388 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007389 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007390{
Victor Stinner554f3f02010-06-16 23:33:54 +00007391 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007392 BOOL *pusedDefaultChar = &usedDefaultChar;
7393 int outsize;
Victor Stinner24729f32011-11-10 20:31:37 +01007394 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007395 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007396 const DWORD flags = encode_code_page_flags(code_page, NULL);
7397 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007398 /* Create a substring so that we can get the UTF-16 representation
7399 of just the slice under consideration. */
7400 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007401
Martin v. Löwis3d325192011-11-04 18:23:06 +01007402 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007403
Victor Stinner3a50e702011-10-18 21:21:00 +02007404 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007405 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007406 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007407 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007408
Victor Stinner2fc507f2011-11-04 20:06:39 +01007409 substring = PyUnicode_Substring(unicode, offset, offset+len);
7410 if (substring == NULL)
7411 return -1;
7412 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7413 if (p == NULL) {
7414 Py_DECREF(substring);
7415 return -1;
7416 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007417 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007418
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007419 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007420 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007421 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007422 NULL, 0,
7423 NULL, pusedDefaultChar);
7424 if (outsize <= 0)
7425 goto error;
7426 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007427 if (pusedDefaultChar && *pusedDefaultChar) {
7428 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007429 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007430 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007431
Victor Stinner3a50e702011-10-18 21:21:00 +02007432 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007433 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007434 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007435 if (*outbytes == NULL) {
7436 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007437 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007438 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007439 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007440 }
7441 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007442 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007443 const Py_ssize_t n = PyBytes_Size(*outbytes);
7444 if (outsize > PY_SSIZE_T_MAX - n) {
7445 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007446 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007447 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007448 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007449 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7450 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007451 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007452 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007453 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007454 }
7455
7456 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007457 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007458 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007459 out, outsize,
7460 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007461 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007462 if (outsize <= 0)
7463 goto error;
7464 if (pusedDefaultChar && *pusedDefaultChar)
7465 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007466 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007467
Victor Stinner3a50e702011-10-18 21:21:00 +02007468error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007469 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007470 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7471 return -2;
7472 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007473 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007474}
7475
Victor Stinner3a50e702011-10-18 21:21:00 +02007476/*
Serhiy Storchakad65c9492015-11-02 14:10:23 +02007477 * Encode a Unicode string to a Windows code page into a byte string using an
Victor Stinner3a50e702011-10-18 21:21:00 +02007478 * error handler.
7479 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007480 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007481 * -1 on other error.
7482 */
7483static int
7484encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007485 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007486 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007487{
Victor Stinner3a50e702011-10-18 21:21:00 +02007488 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007489 Py_ssize_t pos = unicode_offset;
7490 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007491 /* Ideally, we should get reason from FormatMessage. This is the Windows
7492 2000 English version of the message. */
7493 const char *reason = "invalid character";
7494 /* 4=maximum length of a UTF-8 sequence */
7495 char buffer[4];
7496 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7497 Py_ssize_t outsize;
7498 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007499 PyObject *errorHandler = NULL;
7500 PyObject *exc = NULL;
7501 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007502 const char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007503 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007504 PyObject *rep;
7505 int ret = -1;
7506
7507 assert(insize > 0);
7508
7509 encoding = code_page_name(code_page, &encoding_obj);
7510 if (encoding == NULL)
7511 return -1;
7512
7513 if (errors == NULL || strcmp(errors, "strict") == 0) {
7514 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7515 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007516 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007517 if (exc != NULL) {
7518 PyCodec_StrictErrors(exc);
7519 Py_DECREF(exc);
7520 }
7521 Py_XDECREF(encoding_obj);
7522 return -1;
7523 }
7524
7525 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7526 pusedDefaultChar = &usedDefaultChar;
7527 else
7528 pusedDefaultChar = NULL;
7529
7530 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7531 PyErr_NoMemory();
7532 goto error;
7533 }
7534 outsize = insize * Py_ARRAY_LENGTH(buffer);
7535
7536 if (*outbytes == NULL) {
7537 /* Create string object */
7538 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7539 if (*outbytes == NULL)
7540 goto error;
7541 out = PyBytes_AS_STRING(*outbytes);
7542 }
7543 else {
7544 /* Extend string object */
7545 Py_ssize_t n = PyBytes_Size(*outbytes);
7546 if (n > PY_SSIZE_T_MAX - outsize) {
7547 PyErr_NoMemory();
7548 goto error;
7549 }
7550 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7551 goto error;
7552 out = PyBytes_AS_STRING(*outbytes) + n;
7553 }
7554
7555 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007556 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007557 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007558 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7559 wchar_t chars[2];
7560 int charsize;
7561 if (ch < 0x10000) {
7562 chars[0] = (wchar_t)ch;
7563 charsize = 1;
7564 }
7565 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007566 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7567 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007568 charsize = 2;
7569 }
7570
Victor Stinner3a50e702011-10-18 21:21:00 +02007571 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007572 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007573 buffer, Py_ARRAY_LENGTH(buffer),
7574 NULL, pusedDefaultChar);
7575 if (outsize > 0) {
7576 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7577 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007578 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007579 memcpy(out, buffer, outsize);
7580 out += outsize;
7581 continue;
7582 }
7583 }
7584 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7585 PyErr_SetFromWindowsErr(0);
7586 goto error;
7587 }
7588
Victor Stinner3a50e702011-10-18 21:21:00 +02007589 rep = unicode_encode_call_errorhandler(
7590 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007591 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007592 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007593 if (rep == NULL)
7594 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007595 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007596
7597 if (PyBytes_Check(rep)) {
7598 outsize = PyBytes_GET_SIZE(rep);
7599 if (outsize != 1) {
7600 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7601 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7602 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7603 Py_DECREF(rep);
7604 goto error;
7605 }
7606 out = PyBytes_AS_STRING(*outbytes) + offset;
7607 }
7608 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7609 out += outsize;
7610 }
7611 else {
7612 Py_ssize_t i;
7613 enum PyUnicode_Kind kind;
7614 void *data;
7615
Benjamin Petersonbac79492012-01-14 13:34:47 -05007616 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007617 Py_DECREF(rep);
7618 goto error;
7619 }
7620
7621 outsize = PyUnicode_GET_LENGTH(rep);
7622 if (outsize != 1) {
7623 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7624 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7625 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7626 Py_DECREF(rep);
7627 goto error;
7628 }
7629 out = PyBytes_AS_STRING(*outbytes) + offset;
7630 }
7631 kind = PyUnicode_KIND(rep);
7632 data = PyUnicode_DATA(rep);
7633 for (i=0; i < outsize; i++) {
7634 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7635 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007636 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007637 encoding, unicode,
7638 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007639 "unable to encode error handler result to ASCII");
7640 Py_DECREF(rep);
7641 goto error;
7642 }
7643 *out = (unsigned char)ch;
7644 out++;
7645 }
7646 }
7647 Py_DECREF(rep);
7648 }
7649 /* write a NUL byte */
7650 *out = 0;
7651 outsize = out - PyBytes_AS_STRING(*outbytes);
7652 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7653 if (_PyBytes_Resize(outbytes, outsize) < 0)
7654 goto error;
7655 ret = 0;
7656
7657error:
7658 Py_XDECREF(encoding_obj);
7659 Py_XDECREF(errorHandler);
7660 Py_XDECREF(exc);
7661 return ret;
7662}
7663
Victor Stinner3a50e702011-10-18 21:21:00 +02007664static PyObject *
7665encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007666 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007667 const char *errors)
7668{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007669 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007670 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007671 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007672 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007673
Victor Stinner29dacf22015-01-26 16:41:32 +01007674 if (!PyUnicode_Check(unicode)) {
7675 PyErr_BadArgument();
7676 return NULL;
7677 }
7678
Benjamin Petersonbac79492012-01-14 13:34:47 -05007679 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007680 return NULL;
7681 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007682
Victor Stinner3a50e702011-10-18 21:21:00 +02007683 if (code_page < 0) {
7684 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7685 return NULL;
7686 }
7687
Martin v. Löwis3d325192011-11-04 18:23:06 +01007688 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007689 return PyBytes_FromStringAndSize(NULL, 0);
7690
Victor Stinner7581cef2011-11-03 22:32:33 +01007691 offset = 0;
7692 do
7693 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007694#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007695 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007696 chunks. */
7697 if (len > INT_MAX/2) {
7698 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007699 done = 0;
7700 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007701 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007702#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007703 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007704 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007705 done = 1;
7706 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007707
Victor Stinner76a31a62011-11-04 00:05:13 +01007708 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007709 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007710 errors);
7711 if (ret == -2)
7712 ret = encode_code_page_errors(code_page, &outbytes,
7713 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007714 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007715 if (ret < 0) {
7716 Py_XDECREF(outbytes);
7717 return NULL;
7718 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007719
Victor Stinner7581cef2011-11-03 22:32:33 +01007720 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007721 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007722 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007723
Victor Stinner3a50e702011-10-18 21:21:00 +02007724 return outbytes;
7725}
7726
7727PyObject *
7728PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7729 Py_ssize_t size,
7730 const char *errors)
7731{
Victor Stinner7581cef2011-11-03 22:32:33 +01007732 PyObject *unicode, *res;
7733 unicode = PyUnicode_FromUnicode(p, size);
7734 if (unicode == NULL)
7735 return NULL;
7736 res = encode_code_page(CP_ACP, unicode, errors);
7737 Py_DECREF(unicode);
7738 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007739}
7740
7741PyObject *
7742PyUnicode_EncodeCodePage(int code_page,
7743 PyObject *unicode,
7744 const char *errors)
7745{
Victor Stinner7581cef2011-11-03 22:32:33 +01007746 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007747}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007748
Alexander Belopolsky40018472011-02-26 01:02:56 +00007749PyObject *
7750PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007751{
Victor Stinner7581cef2011-11-03 22:32:33 +01007752 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007753}
7754
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007755#undef NEED_RETRY
7756
Steve Dowercc16be82016-09-08 10:35:16 -07007757#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007758
Guido van Rossumd57fd912000-03-10 22:53:23 +00007759/* --- Character Mapping Codec -------------------------------------------- */
7760
Victor Stinnerfb161b12013-04-18 01:44:27 +02007761static int
7762charmap_decode_string(const char *s,
7763 Py_ssize_t size,
7764 PyObject *mapping,
7765 const char *errors,
7766 _PyUnicodeWriter *writer)
7767{
7768 const char *starts = s;
7769 const char *e;
7770 Py_ssize_t startinpos, endinpos;
7771 PyObject *errorHandler = NULL, *exc = NULL;
7772 Py_ssize_t maplen;
7773 enum PyUnicode_Kind mapkind;
7774 void *mapdata;
7775 Py_UCS4 x;
7776 unsigned char ch;
7777
7778 if (PyUnicode_READY(mapping) == -1)
7779 return -1;
7780
7781 maplen = PyUnicode_GET_LENGTH(mapping);
7782 mapdata = PyUnicode_DATA(mapping);
7783 mapkind = PyUnicode_KIND(mapping);
7784
7785 e = s + size;
7786
7787 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7788 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7789 * is disabled in encoding aliases, latin1 is preferred because
7790 * its implementation is faster. */
7791 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7792 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7793 Py_UCS4 maxchar = writer->maxchar;
7794
7795 assert (writer->kind == PyUnicode_1BYTE_KIND);
7796 while (s < e) {
7797 ch = *s;
7798 x = mapdata_ucs1[ch];
7799 if (x > maxchar) {
7800 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7801 goto onError;
7802 maxchar = writer->maxchar;
7803 outdata = (Py_UCS1 *)writer->data;
7804 }
7805 outdata[writer->pos] = x;
7806 writer->pos++;
7807 ++s;
7808 }
7809 return 0;
7810 }
7811
7812 while (s < e) {
7813 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7814 enum PyUnicode_Kind outkind = writer->kind;
7815 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7816 if (outkind == PyUnicode_1BYTE_KIND) {
7817 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7818 Py_UCS4 maxchar = writer->maxchar;
7819 while (s < e) {
7820 ch = *s;
7821 x = mapdata_ucs2[ch];
7822 if (x > maxchar)
7823 goto Error;
7824 outdata[writer->pos] = x;
7825 writer->pos++;
7826 ++s;
7827 }
7828 break;
7829 }
7830 else if (outkind == PyUnicode_2BYTE_KIND) {
7831 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7832 while (s < e) {
7833 ch = *s;
7834 x = mapdata_ucs2[ch];
7835 if (x == 0xFFFE)
7836 goto Error;
7837 outdata[writer->pos] = x;
7838 writer->pos++;
7839 ++s;
7840 }
7841 break;
7842 }
7843 }
7844 ch = *s;
7845
7846 if (ch < maplen)
7847 x = PyUnicode_READ(mapkind, mapdata, ch);
7848 else
7849 x = 0xfffe; /* invalid value */
7850Error:
7851 if (x == 0xfffe)
7852 {
7853 /* undefined mapping */
7854 startinpos = s-starts;
7855 endinpos = startinpos+1;
7856 if (unicode_decode_call_errorhandler_writer(
7857 errors, &errorHandler,
7858 "charmap", "character maps to <undefined>",
7859 &starts, &e, &startinpos, &endinpos, &exc, &s,
7860 writer)) {
7861 goto onError;
7862 }
7863 continue;
7864 }
7865
7866 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
7867 goto onError;
7868 ++s;
7869 }
7870 Py_XDECREF(errorHandler);
7871 Py_XDECREF(exc);
7872 return 0;
7873
7874onError:
7875 Py_XDECREF(errorHandler);
7876 Py_XDECREF(exc);
7877 return -1;
7878}
7879
7880static int
7881charmap_decode_mapping(const char *s,
7882 Py_ssize_t size,
7883 PyObject *mapping,
7884 const char *errors,
7885 _PyUnicodeWriter *writer)
7886{
7887 const char *starts = s;
7888 const char *e;
7889 Py_ssize_t startinpos, endinpos;
7890 PyObject *errorHandler = NULL, *exc = NULL;
7891 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02007892 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02007893
7894 e = s + size;
7895
7896 while (s < e) {
7897 ch = *s;
7898
7899 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7900 key = PyLong_FromLong((long)ch);
7901 if (key == NULL)
7902 goto onError;
7903
7904 item = PyObject_GetItem(mapping, key);
7905 Py_DECREF(key);
7906 if (item == NULL) {
7907 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7908 /* No mapping found means: mapping is undefined. */
7909 PyErr_Clear();
7910 goto Undefined;
7911 } else
7912 goto onError;
7913 }
7914
7915 /* Apply mapping */
7916 if (item == Py_None)
7917 goto Undefined;
7918 if (PyLong_Check(item)) {
7919 long value = PyLong_AS_LONG(item);
7920 if (value == 0xFFFE)
7921 goto Undefined;
7922 if (value < 0 || value > MAX_UNICODE) {
7923 PyErr_Format(PyExc_TypeError,
7924 "character mapping must be in range(0x%lx)",
7925 (unsigned long)MAX_UNICODE + 1);
7926 goto onError;
7927 }
7928
7929 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7930 goto onError;
7931 }
7932 else if (PyUnicode_Check(item)) {
7933 if (PyUnicode_READY(item) == -1)
7934 goto onError;
7935 if (PyUnicode_GET_LENGTH(item) == 1) {
7936 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
7937 if (value == 0xFFFE)
7938 goto Undefined;
7939 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7940 goto onError;
7941 }
7942 else {
7943 writer->overallocate = 1;
7944 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
7945 goto onError;
7946 }
7947 }
7948 else {
7949 /* wrong return value */
7950 PyErr_SetString(PyExc_TypeError,
7951 "character mapping must return integer, None or str");
7952 goto onError;
7953 }
7954 Py_CLEAR(item);
7955 ++s;
7956 continue;
7957
7958Undefined:
7959 /* undefined mapping */
7960 Py_CLEAR(item);
7961 startinpos = s-starts;
7962 endinpos = startinpos+1;
7963 if (unicode_decode_call_errorhandler_writer(
7964 errors, &errorHandler,
7965 "charmap", "character maps to <undefined>",
7966 &starts, &e, &startinpos, &endinpos, &exc, &s,
7967 writer)) {
7968 goto onError;
7969 }
7970 }
7971 Py_XDECREF(errorHandler);
7972 Py_XDECREF(exc);
7973 return 0;
7974
7975onError:
7976 Py_XDECREF(item);
7977 Py_XDECREF(errorHandler);
7978 Py_XDECREF(exc);
7979 return -1;
7980}
7981
Alexander Belopolsky40018472011-02-26 01:02:56 +00007982PyObject *
7983PyUnicode_DecodeCharmap(const char *s,
7984 Py_ssize_t size,
7985 PyObject *mapping,
7986 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007987{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007988 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00007989
Guido van Rossumd57fd912000-03-10 22:53:23 +00007990 /* Default to Latin-1 */
7991 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007992 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007993
Guido van Rossumd57fd912000-03-10 22:53:23 +00007994 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02007995 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02007996 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02007997 writer.min_length = size;
7998 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007999 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008000
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008001 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008002 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8003 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008004 }
8005 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008006 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8007 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008008 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008009 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00008010
Benjamin Peterson29060642009-01-31 22:14:21 +00008011 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008012 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008013 return NULL;
8014}
8015
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008016/* Charmap encoding: the lookup table */
8017
Alexander Belopolsky40018472011-02-26 01:02:56 +00008018struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00008019 PyObject_HEAD
8020 unsigned char level1[32];
8021 int count2, count3;
8022 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008023};
8024
8025static PyObject*
8026encoding_map_size(PyObject *obj, PyObject* args)
8027{
8028 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008029 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00008030 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008031}
8032
8033static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008034 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00008035 PyDoc_STR("Return the size (in bytes) of this object") },
8036 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008037};
8038
8039static void
8040encoding_map_dealloc(PyObject* o)
8041{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008042 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008043}
8044
8045static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008046 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008047 "EncodingMap", /*tp_name*/
8048 sizeof(struct encoding_map), /*tp_basicsize*/
8049 0, /*tp_itemsize*/
8050 /* methods */
8051 encoding_map_dealloc, /*tp_dealloc*/
8052 0, /*tp_print*/
8053 0, /*tp_getattr*/
8054 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00008055 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008056 0, /*tp_repr*/
8057 0, /*tp_as_number*/
8058 0, /*tp_as_sequence*/
8059 0, /*tp_as_mapping*/
8060 0, /*tp_hash*/
8061 0, /*tp_call*/
8062 0, /*tp_str*/
8063 0, /*tp_getattro*/
8064 0, /*tp_setattro*/
8065 0, /*tp_as_buffer*/
8066 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8067 0, /*tp_doc*/
8068 0, /*tp_traverse*/
8069 0, /*tp_clear*/
8070 0, /*tp_richcompare*/
8071 0, /*tp_weaklistoffset*/
8072 0, /*tp_iter*/
8073 0, /*tp_iternext*/
8074 encoding_map_methods, /*tp_methods*/
8075 0, /*tp_members*/
8076 0, /*tp_getset*/
8077 0, /*tp_base*/
8078 0, /*tp_dict*/
8079 0, /*tp_descr_get*/
8080 0, /*tp_descr_set*/
8081 0, /*tp_dictoffset*/
8082 0, /*tp_init*/
8083 0, /*tp_alloc*/
8084 0, /*tp_new*/
8085 0, /*tp_free*/
8086 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008087};
8088
8089PyObject*
8090PyUnicode_BuildEncodingMap(PyObject* string)
8091{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008092 PyObject *result;
8093 struct encoding_map *mresult;
8094 int i;
8095 int need_dict = 0;
8096 unsigned char level1[32];
8097 unsigned char level2[512];
8098 unsigned char *mlevel1, *mlevel2, *mlevel3;
8099 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008100 int kind;
8101 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008102 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008103 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008104
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008105 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008106 PyErr_BadArgument();
8107 return NULL;
8108 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008109 kind = PyUnicode_KIND(string);
8110 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008111 length = PyUnicode_GET_LENGTH(string);
8112 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008113 memset(level1, 0xFF, sizeof level1);
8114 memset(level2, 0xFF, sizeof level2);
8115
8116 /* If there isn't a one-to-one mapping of NULL to \0,
8117 or if there are non-BMP characters, we need to use
8118 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008119 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008120 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008121 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008122 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008123 ch = PyUnicode_READ(kind, data, i);
8124 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008125 need_dict = 1;
8126 break;
8127 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008128 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008129 /* unmapped character */
8130 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008131 l1 = ch >> 11;
8132 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008133 if (level1[l1] == 0xFF)
8134 level1[l1] = count2++;
8135 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008136 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008137 }
8138
8139 if (count2 >= 0xFF || count3 >= 0xFF)
8140 need_dict = 1;
8141
8142 if (need_dict) {
8143 PyObject *result = PyDict_New();
8144 PyObject *key, *value;
8145 if (!result)
8146 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008147 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008148 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008149 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008150 if (!key || !value)
8151 goto failed1;
8152 if (PyDict_SetItem(result, key, value) == -1)
8153 goto failed1;
8154 Py_DECREF(key);
8155 Py_DECREF(value);
8156 }
8157 return result;
8158 failed1:
8159 Py_XDECREF(key);
8160 Py_XDECREF(value);
8161 Py_DECREF(result);
8162 return NULL;
8163 }
8164
8165 /* Create a three-level trie */
8166 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8167 16*count2 + 128*count3 - 1);
8168 if (!result)
8169 return PyErr_NoMemory();
8170 PyObject_Init(result, &EncodingMapType);
8171 mresult = (struct encoding_map*)result;
8172 mresult->count2 = count2;
8173 mresult->count3 = count3;
8174 mlevel1 = mresult->level1;
8175 mlevel2 = mresult->level23;
8176 mlevel3 = mresult->level23 + 16*count2;
8177 memcpy(mlevel1, level1, 32);
8178 memset(mlevel2, 0xFF, 16*count2);
8179 memset(mlevel3, 0, 128*count3);
8180 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008181 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008182 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008183 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8184 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008185 /* unmapped character */
8186 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008187 o1 = ch>>11;
8188 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008189 i2 = 16*mlevel1[o1] + o2;
8190 if (mlevel2[i2] == 0xFF)
8191 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008192 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008193 i3 = 128*mlevel2[i2] + o3;
8194 mlevel3[i3] = i;
8195 }
8196 return result;
8197}
8198
8199static int
Victor Stinner22168992011-11-20 17:09:18 +01008200encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008201{
8202 struct encoding_map *map = (struct encoding_map*)mapping;
8203 int l1 = c>>11;
8204 int l2 = (c>>7) & 0xF;
8205 int l3 = c & 0x7F;
8206 int i;
8207
Victor Stinner22168992011-11-20 17:09:18 +01008208 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008209 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008210 if (c == 0)
8211 return 0;
8212 /* level 1*/
8213 i = map->level1[l1];
8214 if (i == 0xFF) {
8215 return -1;
8216 }
8217 /* level 2*/
8218 i = map->level23[16*i+l2];
8219 if (i == 0xFF) {
8220 return -1;
8221 }
8222 /* level 3 */
8223 i = map->level23[16*map->count2 + 128*i + l3];
8224 if (i == 0) {
8225 return -1;
8226 }
8227 return i;
8228}
8229
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008230/* Lookup the character ch in the mapping. If the character
8231 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008232 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008233static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008234charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008235{
Christian Heimes217cfd12007-12-02 14:31:20 +00008236 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008237 PyObject *x;
8238
8239 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008240 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008241 x = PyObject_GetItem(mapping, w);
8242 Py_DECREF(w);
8243 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008244 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8245 /* No mapping found means: mapping is undefined. */
8246 PyErr_Clear();
8247 x = Py_None;
8248 Py_INCREF(x);
8249 return x;
8250 } else
8251 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008252 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008253 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008254 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008255 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008256 long value = PyLong_AS_LONG(x);
8257 if (value < 0 || value > 255) {
8258 PyErr_SetString(PyExc_TypeError,
8259 "character mapping must be in range(256)");
8260 Py_DECREF(x);
8261 return NULL;
8262 }
8263 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008264 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008265 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008266 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008267 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008268 /* wrong return value */
8269 PyErr_Format(PyExc_TypeError,
8270 "character mapping must return integer, bytes or None, not %.400s",
8271 x->ob_type->tp_name);
8272 Py_DECREF(x);
8273 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008274 }
8275}
8276
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008277static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008278charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008279{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008280 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8281 /* exponentially overallocate to minimize reallocations */
8282 if (requiredsize < 2*outsize)
8283 requiredsize = 2*outsize;
8284 if (_PyBytes_Resize(outobj, requiredsize))
8285 return -1;
8286 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008287}
8288
/* Outcome of trying to encode one character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* an error occurred while looking up or writing */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008292/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008293 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008294 space is available. Return a new reference to the object that
8295 was put in the output buffer, or Py_None, if the mapping was undefined
8296 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008297 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008298static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008299charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008300 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008301{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008302 PyObject *rep;
8303 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008304 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008305
Christian Heimes90aa7642007-12-19 02:45:37 +00008306 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008307 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008308 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008309 if (res == -1)
8310 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008311 if (outsize<requiredsize)
8312 if (charmapencode_resize(outobj, outpos, requiredsize))
8313 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008314 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008315 outstart[(*outpos)++] = (char)res;
8316 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008317 }
8318
8319 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008320 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008321 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008322 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008323 Py_DECREF(rep);
8324 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008325 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008326 if (PyLong_Check(rep)) {
8327 Py_ssize_t requiredsize = *outpos+1;
8328 if (outsize<requiredsize)
8329 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8330 Py_DECREF(rep);
8331 return enc_EXCEPTION;
8332 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008333 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008334 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008335 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008336 else {
8337 const char *repchars = PyBytes_AS_STRING(rep);
8338 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8339 Py_ssize_t requiredsize = *outpos+repsize;
8340 if (outsize<requiredsize)
8341 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8342 Py_DECREF(rep);
8343 return enc_EXCEPTION;
8344 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008345 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008346 memcpy(outstart + *outpos, repchars, repsize);
8347 *outpos += repsize;
8348 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008349 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008350 Py_DECREF(rep);
8351 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008352}
8353
8354/* handle an error in PyUnicode_EncodeCharmap
8355 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008356static int
8357charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008358 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008359 PyObject **exceptionObject,
Victor Stinner50149202015-09-22 00:26:54 +02008360 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008361 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008362{
8363 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008364 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008365 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008366 enum PyUnicode_Kind kind;
8367 void *data;
8368 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008369 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008370 Py_ssize_t collstartpos = *inpos;
8371 Py_ssize_t collendpos = *inpos+1;
8372 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008373 char *encoding = "charmap";
8374 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008375 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008376 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008377 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008378
Benjamin Petersonbac79492012-01-14 13:34:47 -05008379 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008380 return -1;
8381 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008382 /* find all unencodable characters */
8383 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008384 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008385 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008386 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008387 val = encoding_map_lookup(ch, mapping);
8388 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008389 break;
8390 ++collendpos;
8391 continue;
8392 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008393
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008394 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8395 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008396 if (rep==NULL)
8397 return -1;
8398 else if (rep!=Py_None) {
8399 Py_DECREF(rep);
8400 break;
8401 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008402 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008403 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008404 }
8405 /* cache callback name lookup
8406 * (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02008407 if (*error_handler == _Py_ERROR_UNKNOWN)
8408 *error_handler = get_error_handler(errors);
8409
8410 switch (*error_handler) {
8411 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008412 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008413 return -1;
Victor Stinner50149202015-09-22 00:26:54 +02008414
8415 case _Py_ERROR_REPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008416 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008417 x = charmapencode_output('?', mapping, res, respos);
8418 if (x==enc_EXCEPTION) {
8419 return -1;
8420 }
8421 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008422 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008423 return -1;
8424 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008425 }
8426 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02008427 case _Py_ERROR_IGNORE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008428 *inpos = collendpos;
8429 break;
Victor Stinner50149202015-09-22 00:26:54 +02008430
8431 case _Py_ERROR_XMLCHARREFREPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008432 /* generate replacement (temporarily (mis)uses p) */
8433 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008434 char buffer[2+29+1+1];
8435 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008436 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008437 for (cp = buffer; *cp; ++cp) {
8438 x = charmapencode_output(*cp, mapping, res, respos);
8439 if (x==enc_EXCEPTION)
8440 return -1;
8441 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008442 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008443 return -1;
8444 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008445 }
8446 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008447 *inpos = collendpos;
8448 break;
Victor Stinner50149202015-09-22 00:26:54 +02008449
Benjamin Peterson14339b62009-01-31 16:36:08 +00008450 default:
Victor Stinner50149202015-09-22 00:26:54 +02008451 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008452 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008453 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008454 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008455 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008456 if (PyBytes_Check(repunicode)) {
8457 /* Directly copy bytes result to output. */
8458 Py_ssize_t outsize = PyBytes_Size(*res);
8459 Py_ssize_t requiredsize;
8460 repsize = PyBytes_Size(repunicode);
8461 requiredsize = *respos + repsize;
8462 if (requiredsize > outsize)
8463 /* Make room for all additional bytes. */
8464 if (charmapencode_resize(res, respos, requiredsize)) {
8465 Py_DECREF(repunicode);
8466 return -1;
8467 }
8468 memcpy(PyBytes_AsString(*res) + *respos,
8469 PyBytes_AsString(repunicode), repsize);
8470 *respos += repsize;
8471 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008472 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008473 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008474 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008475 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008476 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008477 Py_DECREF(repunicode);
8478 return -1;
8479 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008480 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008481 data = PyUnicode_DATA(repunicode);
8482 kind = PyUnicode_KIND(repunicode);
8483 for (index = 0; index < repsize; index++) {
8484 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8485 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008486 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008487 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008488 return -1;
8489 }
8490 else if (x==enc_FAILED) {
8491 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008492 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008493 return -1;
8494 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008495 }
8496 *inpos = newpos;
8497 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008498 }
8499 return 0;
8500}
8501
Alexander Belopolsky40018472011-02-26 01:02:56 +00008502PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008503_PyUnicode_EncodeCharmap(PyObject *unicode,
8504 PyObject *mapping,
8505 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008506{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008507 /* output object */
8508 PyObject *res = NULL;
8509 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008510 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008511 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008512 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008513 Py_ssize_t respos = 0;
Victor Stinner50149202015-09-22 00:26:54 +02008514 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008515 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02008516 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008517 void *data;
8518 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008519
Benjamin Petersonbac79492012-01-14 13:34:47 -05008520 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008521 return NULL;
8522 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008523 data = PyUnicode_DATA(unicode);
8524 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008525
Guido van Rossumd57fd912000-03-10 22:53:23 +00008526 /* Default to Latin-1 */
8527 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008528 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008529
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008530 /* allocate enough for a simple encoding without
8531 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008532 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008533 if (res == NULL)
8534 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008535 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008536 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008537
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008538 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008539 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008540 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008541 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008542 if (x==enc_EXCEPTION) /* error */
8543 goto onError;
8544 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008545 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008546 &exc,
Victor Stinner50149202015-09-22 00:26:54 +02008547 &error_handler, &error_handler_obj, errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00008548 &res, &respos)) {
8549 goto onError;
8550 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008551 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008552 else
8553 /* done with this character => adjust input position */
8554 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008555 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008556
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008557 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008558 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008559 if (_PyBytes_Resize(&res, respos) < 0)
8560 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008561
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008562 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008563 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008564 return res;
8565
Benjamin Peterson29060642009-01-31 22:14:21 +00008566 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008567 Py_XDECREF(res);
8568 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008569 Py_XDECREF(error_handler_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008570 return NULL;
8571}
8572
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008573/* Deprecated */
8574PyObject *
8575PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8576 Py_ssize_t size,
8577 PyObject *mapping,
8578 const char *errors)
8579{
8580 PyObject *result;
8581 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8582 if (unicode == NULL)
8583 return NULL;
8584 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8585 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008586 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008587}
8588
Alexander Belopolsky40018472011-02-26 01:02:56 +00008589PyObject *
8590PyUnicode_AsCharmapString(PyObject *unicode,
8591 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008592{
8593 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008594 PyErr_BadArgument();
8595 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008596 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008597 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008598}
8599
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008600/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008601static void
8602make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008603 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008604 Py_ssize_t startpos, Py_ssize_t endpos,
8605 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008606{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008607 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008608 *exceptionObject = _PyUnicodeTranslateError_Create(
8609 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008610 }
8611 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008612 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8613 goto onError;
8614 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8615 goto onError;
8616 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8617 goto onError;
8618 return;
8619 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008620 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008621 }
8622}
8623
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008624/* error handling callback helper:
8625 build arguments, call the callback and check the arguments,
8626 put the result into newpos and return the replacement string, which
8627 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008628static PyObject *
8629unicode_translate_call_errorhandler(const char *errors,
8630 PyObject **errorHandler,
8631 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008632 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008633 Py_ssize_t startpos, Py_ssize_t endpos,
8634 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008635{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02008636 static const char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008637
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008638 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008639 PyObject *restuple;
8640 PyObject *resunicode;
8641
8642 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008643 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008644 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008645 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008646 }
8647
8648 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008649 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008650 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008651 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008652
8653 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008654 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008655 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008656 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008657 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008658 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008659 Py_DECREF(restuple);
8660 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008661 }
8662 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008663 &resunicode, &i_newpos)) {
8664 Py_DECREF(restuple);
8665 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008666 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008667 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008668 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008669 else
8670 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008671 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02008672 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008673 Py_DECREF(restuple);
8674 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008675 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008676 Py_INCREF(resunicode);
8677 Py_DECREF(restuple);
8678 return resunicode;
8679}
8680
8681/* Lookup the character ch in the mapping and put the result in result,
8682 which must be decrefed by the caller.
8683 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008684static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008685charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008686{
Christian Heimes217cfd12007-12-02 14:31:20 +00008687 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008688 PyObject *x;
8689
8690 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008691 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008692 x = PyObject_GetItem(mapping, w);
8693 Py_DECREF(w);
8694 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008695 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8696 /* No mapping found means: use 1:1 mapping. */
8697 PyErr_Clear();
8698 *result = NULL;
8699 return 0;
8700 } else
8701 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008702 }
8703 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008704 *result = x;
8705 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008706 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008707 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008708 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008709 if (value < 0 || value > MAX_UNICODE) {
8710 PyErr_Format(PyExc_ValueError,
8711 "character mapping must be in range(0x%x)",
8712 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008713 Py_DECREF(x);
8714 return -1;
8715 }
8716 *result = x;
8717 return 0;
8718 }
8719 else if (PyUnicode_Check(x)) {
8720 *result = x;
8721 return 0;
8722 }
8723 else {
8724 /* wrong return value */
8725 PyErr_SetString(PyExc_TypeError,
8726 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008727 Py_DECREF(x);
8728 return -1;
8729 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008730}
Victor Stinner1194ea02014-04-04 19:37:40 +02008731
8732/* lookup the character, write the result into the writer.
8733 Return 1 if the result was written into the writer, return 0 if the mapping
8734 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008735static int
Victor Stinner1194ea02014-04-04 19:37:40 +02008736charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8737 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008738{
Victor Stinner1194ea02014-04-04 19:37:40 +02008739 PyObject *item;
8740
8741 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00008742 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008743
8744 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008745 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02008746 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008747 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008748 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008749 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008750 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008751
8752 if (item == Py_None) {
8753 Py_DECREF(item);
8754 return 0;
8755 }
8756
8757 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02008758 long ch = (Py_UCS4)PyLong_AS_LONG(item);
8759 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8760 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008761 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8762 Py_DECREF(item);
8763 return -1;
8764 }
8765 Py_DECREF(item);
8766 return 1;
8767 }
8768
8769 if (!PyUnicode_Check(item)) {
8770 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008771 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008772 }
8773
8774 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8775 Py_DECREF(item);
8776 return -1;
8777 }
8778
8779 Py_DECREF(item);
8780 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008781}
8782
Victor Stinner89a76ab2014-04-05 11:44:04 +02008783static int
8784unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8785 Py_UCS1 *translate)
8786{
Benjamin Peterson1365de72014-04-07 20:15:41 -04008787 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008788 int ret = 0;
8789
Victor Stinner89a76ab2014-04-05 11:44:04 +02008790 if (charmaptranslate_lookup(ch, mapping, &item)) {
8791 return -1;
8792 }
8793
8794 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008795 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02008796 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008797 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008798 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008799 /* not found => default to 1:1 mapping */
8800 translate[ch] = ch;
8801 return 1;
8802 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008803 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02008804 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008805 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8806 used it */
8807 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008808 /* invalid character or character outside ASCII:
8809 skip the fast translate */
8810 goto exit;
8811 }
8812 translate[ch] = (Py_UCS1)replace;
8813 }
8814 else if (PyUnicode_Check(item)) {
8815 Py_UCS4 replace;
8816
8817 if (PyUnicode_READY(item) == -1) {
8818 Py_DECREF(item);
8819 return -1;
8820 }
8821 if (PyUnicode_GET_LENGTH(item) != 1)
8822 goto exit;
8823
8824 replace = PyUnicode_READ_CHAR(item, 0);
8825 if (replace > 127)
8826 goto exit;
8827 translate[ch] = (Py_UCS1)replace;
8828 }
8829 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008830 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02008831 goto exit;
8832 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008833 ret = 1;
8834
Benjamin Peterson1365de72014-04-07 20:15:41 -04008835 exit:
8836 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008837 return ret;
8838}
8839
8840/* Fast path for ascii => ascii translation. Return 1 if the whole string
8841 was translated into writer, return 0 if the input string was partially
8842 translated into writer, raise an exception and return -1 on error. */
8843static int
8844unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008845 _PyUnicodeWriter *writer, int ignore,
8846 Py_ssize_t *input_pos)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008847{
Victor Stinner872b2912014-04-05 14:27:07 +02008848 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008849 Py_ssize_t len;
8850 Py_UCS1 *in, *end, *out;
Victor Stinner872b2912014-04-05 14:27:07 +02008851 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008852
Victor Stinner89a76ab2014-04-05 11:44:04 +02008853 len = PyUnicode_GET_LENGTH(input);
8854
Victor Stinner872b2912014-04-05 14:27:07 +02008855 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008856
8857 in = PyUnicode_1BYTE_DATA(input);
8858 end = in + len;
8859
8860 assert(PyUnicode_IS_ASCII(writer->buffer));
8861 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
8862 out = PyUnicode_1BYTE_DATA(writer->buffer);
8863
Victor Stinner872b2912014-04-05 14:27:07 +02008864 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008865 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02008866 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008867 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02008868 int translate = unicode_fast_translate_lookup(mapping, ch,
8869 ascii_table);
8870 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008871 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02008872 if (translate == 0)
8873 goto exit;
8874 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008875 }
Victor Stinner872b2912014-04-05 14:27:07 +02008876 if (ch2 == 0xfe) {
8877 if (ignore)
8878 continue;
8879 goto exit;
8880 }
8881 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008882 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02008883 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008884 }
Victor Stinner872b2912014-04-05 14:27:07 +02008885 res = 1;
8886
8887exit:
8888 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008889 *input_pos = in - PyUnicode_1BYTE_DATA(input);
Victor Stinner872b2912014-04-05 14:27:07 +02008890 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008891}
8892
Victor Stinner3222da22015-10-01 22:07:32 +02008893static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008894_PyUnicode_TranslateCharmap(PyObject *input,
8895 PyObject *mapping,
8896 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008897{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008898 /* input object */
Victor Stinner1194ea02014-04-04 19:37:40 +02008899 char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008900 Py_ssize_t size, i;
8901 int kind;
8902 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02008903 _PyUnicodeWriter writer;
8904 /* error handler */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008905 char *reason = "character maps to <undefined>";
8906 PyObject *errorHandler = NULL;
8907 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008908 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008909 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008910
Guido van Rossumd57fd912000-03-10 22:53:23 +00008911 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008912 PyErr_BadArgument();
8913 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008914 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008915
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008916 if (PyUnicode_READY(input) == -1)
8917 return NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008918 data = (char*)PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008919 kind = PyUnicode_KIND(input);
8920 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008921
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03008922 if (size == 0)
8923 return PyUnicode_FromObject(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008924
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008925 /* allocate enough for a simple 1:1 translation without
8926 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02008927 _PyUnicodeWriter_Init(&writer);
8928 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008929 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008930
Victor Stinner872b2912014-04-05 14:27:07 +02008931 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
8932
Victor Stinner33798672016-03-01 21:59:58 +01008933 if (PyUnicode_READY(input) == -1)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008934 return NULL;
Victor Stinner33798672016-03-01 21:59:58 +01008935 if (PyUnicode_IS_ASCII(input)) {
8936 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
8937 if (res < 0) {
8938 _PyUnicodeWriter_Dealloc(&writer);
8939 return NULL;
8940 }
8941 if (res == 1)
8942 return _PyUnicodeWriter_Finish(&writer);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008943 }
Victor Stinner33798672016-03-01 21:59:58 +01008944 else {
8945 i = 0;
8946 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008947
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008948 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008949 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008950 int translate;
8951 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8952 Py_ssize_t newpos;
8953 /* startpos for collecting untranslatable chars */
8954 Py_ssize_t collstart;
8955 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02008956 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008957
Victor Stinner1194ea02014-04-04 19:37:40 +02008958 ch = PyUnicode_READ(kind, data, i);
8959 translate = charmaptranslate_output(ch, mapping, &writer);
8960 if (translate < 0)
8961 goto onError;
8962
8963 if (translate != 0) {
8964 /* it worked => adjust input pointer */
8965 ++i;
8966 continue;
8967 }
8968
8969 /* untranslatable character */
8970 collstart = i;
8971 collend = i+1;
8972
8973 /* find all untranslatable characters */
8974 while (collend < size) {
8975 PyObject *x;
8976 ch = PyUnicode_READ(kind, data, collend);
8977 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008978 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02008979 Py_XDECREF(x);
8980 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008981 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02008982 ++collend;
8983 }
8984
8985 if (ignore) {
8986 i = collend;
8987 }
8988 else {
8989 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
8990 reason, input, &exc,
8991 collstart, collend, &newpos);
8992 if (repunicode == NULL)
8993 goto onError;
8994 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008995 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02008996 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008997 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008998 Py_DECREF(repunicode);
8999 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009000 }
9001 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009002 Py_XDECREF(exc);
9003 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02009004 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009005
Benjamin Peterson29060642009-01-31 22:14:21 +00009006 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02009007 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009008 Py_XDECREF(exc);
9009 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009010 return NULL;
9011}
9012
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009013/* Deprecated. Use PyUnicode_Translate instead. */
9014PyObject *
9015PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9016 Py_ssize_t size,
9017 PyObject *mapping,
9018 const char *errors)
9019{
Christian Heimes5f520f42012-09-11 14:03:25 +02009020 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009021 PyObject *unicode = PyUnicode_FromUnicode(p, size);
9022 if (!unicode)
9023 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02009024 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9025 Py_DECREF(unicode);
9026 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009027}
9028
Alexander Belopolsky40018472011-02-26 01:02:56 +00009029PyObject *
9030PyUnicode_Translate(PyObject *str,
9031 PyObject *mapping,
9032 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009033{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009034 if (ensure_unicode(str) < 0)
Christian Heimes5f520f42012-09-11 14:03:25 +02009035 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009036 return _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009037}
Tim Petersced69f82003-09-16 20:30:58 +00009038
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009039static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009040fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009041{
9042 /* No need to call PyUnicode_READY(self) because this function is only
9043 called as a callback from fixup() which does it already. */
9044 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9045 const int kind = PyUnicode_KIND(self);
9046 void *data = PyUnicode_DATA(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02009047 Py_UCS4 maxchar = 127, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009048 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009049 Py_ssize_t i;
9050
9051 for (i = 0; i < len; ++i) {
9052 ch = PyUnicode_READ(kind, data, i);
9053 fixed = 0;
9054 if (ch > 127) {
9055 if (Py_UNICODE_ISSPACE(ch))
9056 fixed = ' ';
9057 else {
9058 const int decimal = Py_UNICODE_TODECIMAL(ch);
9059 if (decimal >= 0)
9060 fixed = '0' + decimal;
9061 }
9062 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009063 modified = 1;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009064 maxchar = Py_MAX(maxchar, fixed);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009065 PyUnicode_WRITE(kind, data, i, fixed);
9066 }
Victor Stinnere6abb482012-05-02 01:15:40 +02009067 else
Benjamin Peterson7e303732013-06-10 09:19:46 -07009068 maxchar = Py_MAX(maxchar, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009069 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009070 }
9071
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009072 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009073}
9074
9075PyObject *
9076_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9077{
9078 if (!PyUnicode_Check(unicode)) {
9079 PyErr_BadInternalCall();
9080 return NULL;
9081 }
9082 if (PyUnicode_READY(unicode) == -1)
9083 return NULL;
9084 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
9085 /* If the string is already ASCII, just return the same string */
9086 Py_INCREF(unicode);
9087 return unicode;
9088 }
Victor Stinner9310abb2011-10-05 00:59:23 +02009089 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009090}
9091
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009092PyObject *
9093PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9094 Py_ssize_t length)
9095{
Victor Stinnerf0124502011-11-21 23:12:56 +01009096 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009097 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009098 Py_UCS4 maxchar;
9099 enum PyUnicode_Kind kind;
9100 void *data;
9101
Victor Stinner99d7ad02012-02-22 13:37:39 +01009102 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009103 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009104 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009105 if (ch > 127) {
9106 int decimal = Py_UNICODE_TODECIMAL(ch);
9107 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009108 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009109 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009110 }
9111 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009112
9113 /* Copy to a new string */
9114 decimal = PyUnicode_New(length, maxchar);
9115 if (decimal == NULL)
9116 return decimal;
9117 kind = PyUnicode_KIND(decimal);
9118 data = PyUnicode_DATA(decimal);
9119 /* Iterate over code points */
9120 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009121 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01009122 if (ch > 127) {
9123 int decimal = Py_UNICODE_TODECIMAL(ch);
9124 if (decimal >= 0)
9125 ch = '0' + decimal;
9126 }
9127 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009128 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009129 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009130}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009131/* --- Decimal Encoder ---------------------------------------------------- */
9132
Alexander Belopolsky40018472011-02-26 01:02:56 +00009133int
9134PyUnicode_EncodeDecimal(Py_UNICODE *s,
9135 Py_ssize_t length,
9136 char *output,
9137 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009138{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009139 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009140 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009141 enum PyUnicode_Kind kind;
9142 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009143
9144 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009145 PyErr_BadArgument();
9146 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009147 }
9148
Victor Stinner42bf7752011-11-21 22:52:58 +01009149 unicode = PyUnicode_FromUnicode(s, length);
9150 if (unicode == NULL)
9151 return -1;
9152
Benjamin Petersonbac79492012-01-14 13:34:47 -05009153 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner6345be92011-11-25 20:09:01 +01009154 Py_DECREF(unicode);
9155 return -1;
9156 }
Victor Stinner42bf7752011-11-21 22:52:58 +01009157 kind = PyUnicode_KIND(unicode);
9158 data = PyUnicode_DATA(unicode);
9159
Victor Stinnerb84d7232011-11-22 01:50:07 +01009160 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009161 PyObject *exc;
9162 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009163 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009164 Py_ssize_t startpos;
9165
9166 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009167
Benjamin Peterson29060642009-01-31 22:14:21 +00009168 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009169 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009170 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009171 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009172 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009173 decimal = Py_UNICODE_TODECIMAL(ch);
9174 if (decimal >= 0) {
9175 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009176 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009177 continue;
9178 }
9179 if (0 < ch && ch < 256) {
9180 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009181 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009182 continue;
9183 }
Victor Stinner6345be92011-11-25 20:09:01 +01009184
Victor Stinner42bf7752011-11-21 22:52:58 +01009185 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009186 exc = NULL;
9187 raise_encode_exception(&exc, "decimal", unicode,
9188 startpos, startpos+1,
9189 "invalid decimal Unicode string");
9190 Py_XDECREF(exc);
9191 Py_DECREF(unicode);
9192 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009193 }
9194 /* 0-terminate the output string */
9195 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009196 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009197 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009198}
9199
Guido van Rossumd57fd912000-03-10 22:53:23 +00009200/* --- Helpers ------------------------------------------------------------ */
9201
/* Helper macro to fix up start/end slice values in place.

   - end greater than len is clipped to len; a negative end has len added to
     it and is then floored at 0.
   - a negative start has len added to it and is then floored at 0.  A start
     greater than len is left unchanged; callers compare end - start
     afterwards.

   `start` and `end` must be modifiable lvalues.  Every argument is expanded
   more than once, so none of them may have side effects.

   The body is wrapped in do { ... } while (0) so that the macro behaves as a
   single statement (safe under an unbraced if/else); it must be followed by
   a semicolon at the call site. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
9216
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009217static Py_ssize_t
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009218any_find_slice(PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009219 Py_ssize_t start,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009220 Py_ssize_t end,
9221 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009222{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009223 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009224 void *buf1, *buf2;
9225 Py_ssize_t len1, len2, result;
9226
9227 kind1 = PyUnicode_KIND(s1);
9228 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009229 if (kind1 < kind2)
9230 return -1;
9231
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009232 len1 = PyUnicode_GET_LENGTH(s1);
9233 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009234 ADJUST_INDICES(start, end, len1);
9235 if (end - start < len2)
9236 return -1;
9237
9238 buf1 = PyUnicode_DATA(s1);
9239 buf2 = PyUnicode_DATA(s2);
9240 if (len2 == 1) {
9241 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9242 result = findchar((const char *)buf1 + kind1*start,
9243 kind1, end - start, ch, direction);
9244 if (result == -1)
9245 return -1;
9246 else
9247 return start + result;
9248 }
9249
9250 if (kind2 != kind1) {
9251 buf2 = _PyUnicode_AsKind(s2, kind1);
9252 if (!buf2)
9253 return -2;
9254 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009255
Victor Stinner794d5672011-10-10 03:21:36 +02009256 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009257 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009258 case PyUnicode_1BYTE_KIND:
9259 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9260 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9261 else
9262 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9263 break;
9264 case PyUnicode_2BYTE_KIND:
9265 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9266 break;
9267 case PyUnicode_4BYTE_KIND:
9268 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9269 break;
9270 default:
9271 assert(0); result = -2;
9272 }
9273 }
9274 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009275 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009276 case PyUnicode_1BYTE_KIND:
9277 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9278 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9279 else
9280 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9281 break;
9282 case PyUnicode_2BYTE_KIND:
9283 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9284 break;
9285 case PyUnicode_4BYTE_KIND:
9286 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9287 break;
9288 default:
9289 assert(0); result = -2;
9290 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009291 }
9292
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009293 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009294 PyMem_Free(buf2);
9295
9296 return result;
9297}
9298
9299Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009300_PyUnicode_InsertThousandsGrouping(
9301 PyObject *unicode, Py_ssize_t index,
9302 Py_ssize_t n_buffer,
9303 void *digits, Py_ssize_t n_digits,
9304 Py_ssize_t min_width,
9305 const char *grouping, PyObject *thousands_sep,
9306 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009307{
Victor Stinner41a863c2012-02-24 00:37:51 +01009308 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009309 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01009310 Py_ssize_t thousands_sep_len;
9311 Py_ssize_t len;
9312
9313 if (unicode != NULL) {
9314 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009315 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01009316 }
9317 else {
9318 kind = PyUnicode_1BYTE_KIND;
9319 data = NULL;
9320 }
9321 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
9322 thousands_sep_data = PyUnicode_DATA(thousands_sep);
9323 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9324 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01009325 if (thousands_sep_kind < kind) {
9326 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
9327 if (!thousands_sep_data)
9328 return -1;
9329 }
9330 else {
9331 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
9332 if (!data)
9333 return -1;
9334 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009335 }
9336
Benjamin Petersonead6b532011-12-20 17:23:42 -06009337 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009338 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009339 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01009340 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009341 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009342 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009343 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009344 else
Victor Stinner41a863c2012-02-24 00:37:51 +01009345 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02009346 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009347 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009348 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009349 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009350 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009351 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009352 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009353 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009354 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009355 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009356 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009357 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009358 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009359 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009360 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009361 break;
9362 default:
9363 assert(0);
9364 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009365 }
Victor Stinner90f50d42012-02-24 01:44:47 +01009366 if (unicode != NULL && thousands_sep_kind != kind) {
9367 if (thousands_sep_kind < kind)
9368 PyMem_Free(thousands_sep_data);
9369 else
9370 PyMem_Free(data);
9371 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009372 if (unicode == NULL) {
9373 *maxchar = 127;
9374 if (len != n_digits) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009375 *maxchar = Py_MAX(*maxchar,
Victor Stinnere6abb482012-05-02 01:15:40 +02009376 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
Victor Stinner41a863c2012-02-24 00:37:51 +01009377 }
9378 }
9379 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009380}
9381
9382
Alexander Belopolsky40018472011-02-26 01:02:56 +00009383Py_ssize_t
9384PyUnicode_Count(PyObject *str,
9385 PyObject *substr,
9386 Py_ssize_t start,
9387 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009388{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009389 Py_ssize_t result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009390 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009391 void *buf1 = NULL, *buf2 = NULL;
9392 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009393
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009394 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009395 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009396
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009397 kind1 = PyUnicode_KIND(str);
9398 kind2 = PyUnicode_KIND(substr);
9399 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009400 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009401
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009402 len1 = PyUnicode_GET_LENGTH(str);
9403 len2 = PyUnicode_GET_LENGTH(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009404 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009405 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009406 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009407
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009408 buf1 = PyUnicode_DATA(str);
9409 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009410 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009411 buf2 = _PyUnicode_AsKind(substr, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009412 if (!buf2)
9413 goto onError;
9414 }
9415
9416 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009417 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009418 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
Victor Stinnerc3cec782011-10-05 21:24:08 +02009419 result = asciilib_count(
9420 ((Py_UCS1*)buf1) + start, end - start,
9421 buf2, len2, PY_SSIZE_T_MAX
9422 );
9423 else
9424 result = ucs1lib_count(
9425 ((Py_UCS1*)buf1) + start, end - start,
9426 buf2, len2, PY_SSIZE_T_MAX
9427 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009428 break;
9429 case PyUnicode_2BYTE_KIND:
9430 result = ucs2lib_count(
9431 ((Py_UCS2*)buf1) + start, end - start,
9432 buf2, len2, PY_SSIZE_T_MAX
9433 );
9434 break;
9435 case PyUnicode_4BYTE_KIND:
9436 result = ucs4lib_count(
9437 ((Py_UCS4*)buf1) + start, end - start,
9438 buf2, len2, PY_SSIZE_T_MAX
9439 );
9440 break;
9441 default:
9442 assert(0); result = 0;
9443 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009444
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009445 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009446 PyMem_Free(buf2);
9447
Guido van Rossumd57fd912000-03-10 22:53:23 +00009448 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009449 onError:
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009450 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009451 PyMem_Free(buf2);
9452 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009453}
9454
Alexander Belopolsky40018472011-02-26 01:02:56 +00009455Py_ssize_t
9456PyUnicode_Find(PyObject *str,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009457 PyObject *substr,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009458 Py_ssize_t start,
9459 Py_ssize_t end,
9460 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009461{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009462 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009463 return -2;
Tim Petersced69f82003-09-16 20:30:58 +00009464
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009465 return any_find_slice(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009466}
9467
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009468Py_ssize_t
9469PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9470 Py_ssize_t start, Py_ssize_t end,
9471 int direction)
9472{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009473 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009474 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009475 if (PyUnicode_READY(str) == -1)
9476 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009477 if (start < 0 || end < 0) {
9478 PyErr_SetString(PyExc_IndexError, "string index out of range");
9479 return -2;
9480 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009481 if (end > PyUnicode_GET_LENGTH(str))
9482 end = PyUnicode_GET_LENGTH(str);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009483 if (start >= end)
9484 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009485 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009486 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9487 kind, end-start, ch, direction);
9488 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009489 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009490 else
9491 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009492}
9493
Alexander Belopolsky40018472011-02-26 01:02:56 +00009494static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009495tailmatch(PyObject *self,
9496 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009497 Py_ssize_t start,
9498 Py_ssize_t end,
9499 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009500{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009501 int kind_self;
9502 int kind_sub;
9503 void *data_self;
9504 void *data_sub;
9505 Py_ssize_t offset;
9506 Py_ssize_t i;
9507 Py_ssize_t end_sub;
9508
9509 if (PyUnicode_READY(self) == -1 ||
9510 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009511 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009512
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009513 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9514 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009515 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009516 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009517
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009518 if (PyUnicode_GET_LENGTH(substring) == 0)
9519 return 1;
9520
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009521 kind_self = PyUnicode_KIND(self);
9522 data_self = PyUnicode_DATA(self);
9523 kind_sub = PyUnicode_KIND(substring);
9524 data_sub = PyUnicode_DATA(substring);
9525 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9526
9527 if (direction > 0)
9528 offset = end;
9529 else
9530 offset = start;
9531
9532 if (PyUnicode_READ(kind_self, data_self, offset) ==
9533 PyUnicode_READ(kind_sub, data_sub, 0) &&
9534 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9535 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9536 /* If both are of the same kind, memcmp is sufficient */
9537 if (kind_self == kind_sub) {
9538 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009539 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009540 data_sub,
9541 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009542 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009543 }
Martin Pantere26da7c2016-06-02 10:07:09 +00009544 /* otherwise we have to compare each character by first accessing it */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009545 else {
9546 /* We do not need to compare 0 and len(substring)-1 because
9547 the if statement above ensured already that they are equal
9548 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009549 for (i = 1; i < end_sub; ++i) {
9550 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9551 PyUnicode_READ(kind_sub, data_sub, i))
9552 return 0;
9553 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009554 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009555 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009556 }
9557
9558 return 0;
9559}
9560
Alexander Belopolsky40018472011-02-26 01:02:56 +00009561Py_ssize_t
9562PyUnicode_Tailmatch(PyObject *str,
9563 PyObject *substr,
9564 Py_ssize_t start,
9565 Py_ssize_t end,
9566 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009567{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009568 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009569 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009570
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009571 return tailmatch(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009572}
9573
Guido van Rossumd57fd912000-03-10 22:53:23 +00009574/* Apply fixfct filter to the Unicode object self and return a
9575 reference to the modified object */
9576
Alexander Belopolsky40018472011-02-26 01:02:56 +00009577static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009578fixup(PyObject *self,
9579 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009580{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009581 PyObject *u;
9582 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009583 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009584
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009585 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009586 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009587 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009588 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009589
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009590 /* fix functions return the new maximum character in a string,
9591 if the kind of the resulting unicode object does not change,
9592 everything is fine. Otherwise we need to change the string kind
9593 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009594 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009595
9596 if (maxchar_new == 0) {
9597 /* no changes */;
9598 if (PyUnicode_CheckExact(self)) {
9599 Py_DECREF(u);
9600 Py_INCREF(self);
9601 return self;
9602 }
9603 else
9604 return u;
9605 }
9606
Victor Stinnere6abb482012-05-02 01:15:40 +02009607 maxchar_new = align_maxchar(maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009608
Victor Stinnereaab6042011-12-11 22:22:39 +01009609 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009610 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009611
9612 /* In case the maximum character changed, we need to
9613 convert the string to the new category. */
9614 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9615 if (v == NULL) {
9616 Py_DECREF(u);
9617 return NULL;
9618 }
9619 if (maxchar_new > maxchar_old) {
9620 /* If the maxchar increased so that the kind changed, not all
9621 characters are representable anymore and we need to fix the
9622 string again. This only happens in very few cases. */
Victor Stinnerd3f08822012-05-29 12:57:52 +02009623 _PyUnicode_FastCopyCharacters(v, 0,
9624 self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinnereaab6042011-12-11 22:22:39 +01009625 maxchar_old = fixfct(v);
9626 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009627 }
9628 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009629 _PyUnicode_FastCopyCharacters(v, 0,
9630 u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009631 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009632 Py_DECREF(u);
9633 assert(_PyUnicode_CheckConsistency(v, 1));
9634 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009635}
9636
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009637static PyObject *
9638ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009639{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009640 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9641 char *resdata, *data = PyUnicode_DATA(self);
9642 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009643
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009644 res = PyUnicode_New(len, 127);
9645 if (res == NULL)
9646 return NULL;
9647 resdata = PyUnicode_DATA(res);
9648 if (lower)
9649 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009650 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009651 _Py_bytes_upper(resdata, data, len);
9652 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009653}
9654
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009655static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009656handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009657{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009658 Py_ssize_t j;
9659 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01009660 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009661 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009662
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009663 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9664
9665 where ! is a negation and \p{xxx} is a character with property xxx.
9666 */
9667 for (j = i - 1; j >= 0; j--) {
9668 c = PyUnicode_READ(kind, data, j);
9669 if (!_PyUnicode_IsCaseIgnorable(c))
9670 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009671 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009672 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9673 if (final_sigma) {
9674 for (j = i + 1; j < length; j++) {
9675 c = PyUnicode_READ(kind, data, j);
9676 if (!_PyUnicode_IsCaseIgnorable(c))
9677 break;
9678 }
9679 final_sigma = j == length || !_PyUnicode_IsCased(c);
9680 }
9681 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009682}
9683
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009684static int
9685lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9686 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009687{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009688 /* Obscure special case. */
9689 if (c == 0x3A3) {
9690 mapped[0] = handle_capital_sigma(kind, data, length, i);
9691 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009692 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009693 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009694}
9695
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009696static Py_ssize_t
9697do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009698{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009699 Py_ssize_t i, k = 0;
9700 int n_res, j;
9701 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009702
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009703 c = PyUnicode_READ(kind, data, 0);
9704 n_res = _PyUnicode_ToUpperFull(c, mapped);
9705 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009706 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009707 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009708 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009709 for (i = 1; i < length; i++) {
9710 c = PyUnicode_READ(kind, data, i);
9711 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9712 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009713 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009714 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009715 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009716 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009717 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009718}
9719
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009720static Py_ssize_t
9721do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9722 Py_ssize_t i, k = 0;
9723
9724 for (i = 0; i < length; i++) {
9725 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9726 int n_res, j;
9727 if (Py_UNICODE_ISUPPER(c)) {
9728 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9729 }
9730 else if (Py_UNICODE_ISLOWER(c)) {
9731 n_res = _PyUnicode_ToUpperFull(c, mapped);
9732 }
9733 else {
9734 n_res = 1;
9735 mapped[0] = c;
9736 }
9737 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009738 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009739 res[k++] = mapped[j];
9740 }
9741 }
9742 return k;
9743}
9744
9745static Py_ssize_t
9746do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9747 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009748{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009749 Py_ssize_t i, k = 0;
9750
9751 for (i = 0; i < length; i++) {
9752 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9753 int n_res, j;
9754 if (lower)
9755 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9756 else
9757 n_res = _PyUnicode_ToUpperFull(c, mapped);
9758 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009759 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009760 res[k++] = mapped[j];
9761 }
9762 }
9763 return k;
9764}
9765
9766static Py_ssize_t
9767do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9768{
9769 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9770}
9771
9772static Py_ssize_t
9773do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9774{
9775 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9776}
9777
Benjamin Petersone51757f2012-01-12 21:10:29 -05009778static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009779do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9780{
9781 Py_ssize_t i, k = 0;
9782
9783 for (i = 0; i < length; i++) {
9784 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9785 Py_UCS4 mapped[3];
9786 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9787 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009788 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009789 res[k++] = mapped[j];
9790 }
9791 }
9792 return k;
9793}
9794
9795static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009796do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9797{
9798 Py_ssize_t i, k = 0;
9799 int previous_is_cased;
9800
9801 previous_is_cased = 0;
9802 for (i = 0; i < length; i++) {
9803 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9804 Py_UCS4 mapped[3];
9805 int n_res, j;
9806
9807 if (previous_is_cased)
9808 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9809 else
9810 n_res = _PyUnicode_ToTitleFull(c, mapped);
9811
9812 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009813 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009814 res[k++] = mapped[j];
9815 }
9816
9817 previous_is_cased = _PyUnicode_IsCased(c);
9818 }
9819 return k;
9820}
9821
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009822static PyObject *
9823case_operation(PyObject *self,
9824 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9825{
9826 PyObject *res = NULL;
9827 Py_ssize_t length, newlength = 0;
9828 int kind, outkind;
9829 void *data, *outdata;
9830 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9831
Benjamin Petersoneea48462012-01-16 14:28:50 -05009832 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009833
9834 kind = PyUnicode_KIND(self);
9835 data = PyUnicode_DATA(self);
9836 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +02009837 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -04009838 PyErr_SetString(PyExc_OverflowError, "string is too long");
9839 return NULL;
9840 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -04009841 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009842 if (tmp == NULL)
9843 return PyErr_NoMemory();
9844 newlength = perform(kind, data, length, tmp, &maxchar);
9845 res = PyUnicode_New(newlength, maxchar);
9846 if (res == NULL)
9847 goto leave;
9848 tmpend = tmp + newlength;
9849 outdata = PyUnicode_DATA(res);
9850 outkind = PyUnicode_KIND(res);
9851 switch (outkind) {
9852 case PyUnicode_1BYTE_KIND:
9853 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9854 break;
9855 case PyUnicode_2BYTE_KIND:
9856 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9857 break;
9858 case PyUnicode_4BYTE_KIND:
9859 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9860 break;
9861 default:
9862 assert(0);
9863 break;
9864 }
9865 leave:
9866 PyMem_FREE(tmp);
9867 return res;
9868}
9869
Tim Peters8ce9f162004-08-27 01:49:32 +00009870PyObject *
9871PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009872{
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009873 PyObject *res;
9874 PyObject *fseq;
9875 Py_ssize_t seqlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009876 PyObject **items;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009877
Benjamin Peterson9743b2c2014-02-15 13:02:52 -05009878 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +00009879 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009880 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009881 }
9882
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009883 /* NOTE: the following code can't call back into Python code,
9884 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009885 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009886
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009887 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00009888 seqlen = PySequence_Fast_GET_SIZE(fseq);
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009889 res = _PyUnicode_JoinArray(separator, items, seqlen);
9890 Py_DECREF(fseq);
9891 return res;
9892}
9893
9894PyObject *
9895_PyUnicode_JoinArray(PyObject *separator, PyObject **items, Py_ssize_t seqlen)
9896{
9897 PyObject *res = NULL; /* the result */
9898 PyObject *sep = NULL;
9899 Py_ssize_t seplen;
9900 PyObject *item;
9901 Py_ssize_t sz, i, res_offset;
9902 Py_UCS4 maxchar;
9903 Py_UCS4 item_maxchar;
9904 int use_memcpy;
9905 unsigned char *res_data = NULL, *sep_data = NULL;
9906 PyObject *last_obj;
9907 unsigned int kind = 0;
9908
Tim Peters05eba1f2004-08-27 21:32:02 +00009909 /* If empty sequence, return u"". */
9910 if (seqlen == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02009911 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +00009912 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009913
Tim Peters05eba1f2004-08-27 21:32:02 +00009914 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009915 last_obj = NULL;
Victor Stinneracf47b82011-10-06 12:32:37 +02009916 if (seqlen == 1) {
9917 if (PyUnicode_CheckExact(items[0])) {
9918 res = items[0];
9919 Py_INCREF(res);
Victor Stinneracf47b82011-10-06 12:32:37 +02009920 return res;
9921 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009922 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009923 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009924 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009925 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009926 /* Set up sep and seplen */
9927 if (separator == NULL) {
9928 /* fall back to a blank space separator */
9929 sep = PyUnicode_FromOrdinal(' ');
9930 if (!sep)
9931 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009932 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009933 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009934 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009935 else {
9936 if (!PyUnicode_Check(separator)) {
9937 PyErr_Format(PyExc_TypeError,
9938 "separator: expected str instance,"
9939 " %.80s found",
9940 Py_TYPE(separator)->tp_name);
9941 goto onError;
9942 }
9943 if (PyUnicode_READY(separator))
9944 goto onError;
9945 sep = separator;
9946 seplen = PyUnicode_GET_LENGTH(separator);
9947 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9948 /* inc refcount to keep this code path symmetric with the
9949 above case of a blank separator */
9950 Py_INCREF(sep);
9951 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009952 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009953 }
9954
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009955 /* There are at least two things to join, or else we have a subclass
9956 * of str in the sequence.
9957 * Do a pre-pass to figure out the total amount of space we'll
9958 * need (sz), and see whether all argument are strings.
9959 */
9960 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009961#ifdef Py_DEBUG
9962 use_memcpy = 0;
9963#else
9964 use_memcpy = 1;
9965#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009966 for (i = 0; i < seqlen; i++) {
9967 const Py_ssize_t old_sz = sz;
9968 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009969 if (!PyUnicode_Check(item)) {
9970 PyErr_Format(PyExc_TypeError,
Victor Stinnera33bce02014-07-04 22:47:46 +02009971 "sequence item %zd: expected str instance,"
Benjamin Peterson29060642009-01-31 22:14:21 +00009972 " %.80s found",
9973 i, Py_TYPE(item)->tp_name);
9974 goto onError;
9975 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009976 if (PyUnicode_READY(item) == -1)
9977 goto onError;
9978 sz += PyUnicode_GET_LENGTH(item);
9979 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -07009980 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009981 if (i != 0)
9982 sz += seplen;
9983 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9984 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009985 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009986 goto onError;
9987 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009988 if (use_memcpy && last_obj != NULL) {
9989 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9990 use_memcpy = 0;
9991 }
9992 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009993 }
Tim Petersced69f82003-09-16 20:30:58 +00009994
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009995 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009996 if (res == NULL)
9997 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009998
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009999 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010000#ifdef Py_DEBUG
10001 use_memcpy = 0;
10002#else
10003 if (use_memcpy) {
10004 res_data = PyUnicode_1BYTE_DATA(res);
10005 kind = PyUnicode_KIND(res);
10006 if (seplen != 0)
10007 sep_data = PyUnicode_1BYTE_DATA(sep);
10008 }
10009#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +020010010 if (use_memcpy) {
10011 for (i = 0; i < seqlen; ++i) {
10012 Py_ssize_t itemlen;
10013 item = items[i];
10014
10015 /* Copy item, and maybe the separator. */
10016 if (i && seplen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010017 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010018 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010019 kind * seplen);
10020 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010021 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010022
10023 itemlen = PyUnicode_GET_LENGTH(item);
10024 if (itemlen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010025 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010026 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010027 kind * itemlen);
10028 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010029 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010030 }
10031 assert(res_data == PyUnicode_1BYTE_DATA(res)
10032 + kind * PyUnicode_GET_LENGTH(res));
10033 }
10034 else {
10035 for (i = 0, res_offset = 0; i < seqlen; ++i) {
10036 Py_ssize_t itemlen;
10037 item = items[i];
10038
10039 /* Copy item, and maybe the separator. */
10040 if (i && seplen != 0) {
10041 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10042 res_offset += seplen;
10043 }
10044
10045 itemlen = PyUnicode_GET_LENGTH(item);
10046 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020010047 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +020010048 res_offset += itemlen;
10049 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010050 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010051 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +020010052 }
Tim Peters8ce9f162004-08-27 01:49:32 +000010053
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010054 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010055 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010056 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010057
Benjamin Peterson29060642009-01-31 22:14:21 +000010058 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010059 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010060 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010061 return NULL;
10062}
10063
/* FILL(kind, data, value, start, length)

   Store `value` into `length` consecutive characters of the buffer `data`,
   beginning at character index `start`.  `kind` selects the character
   width (1, 2 or 4 bytes); the wchar kind is rejected by an assertion.

   Every argument is parenthesized on use so that expression arguments
   expand correctly.  Note that `value` and `length` are still evaluated
   more than once in the 2- and 4-byte branches, so they must be free of
   side effects. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
10087
Victor Stinnerd3f08822012-05-29 12:57:52 +020010088void
10089_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10090 Py_UCS4 fill_char)
10091{
10092 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
10093 const void *data = PyUnicode_DATA(unicode);
10094 assert(PyUnicode_IS_READY(unicode));
10095 assert(unicode_modifiable(unicode));
10096 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10097 assert(start >= 0);
10098 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
10099 FILL(kind, data, fill_char, start, length);
10100}
10101
Victor Stinner3fe55312012-01-04 00:33:50 +010010102Py_ssize_t
10103PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10104 Py_UCS4 fill_char)
10105{
10106 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +010010107
10108 if (!PyUnicode_Check(unicode)) {
10109 PyErr_BadInternalCall();
10110 return -1;
10111 }
10112 if (PyUnicode_READY(unicode) == -1)
10113 return -1;
10114 if (unicode_check_modifiable(unicode))
10115 return -1;
10116
Victor Stinnerd3f08822012-05-29 12:57:52 +020010117 if (start < 0) {
10118 PyErr_SetString(PyExc_IndexError, "string index out of range");
10119 return -1;
10120 }
Victor Stinner3fe55312012-01-04 00:33:50 +010010121 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10122 PyErr_SetString(PyExc_ValueError,
10123 "fill character is bigger than "
10124 "the string maximum character");
10125 return -1;
10126 }
10127
10128 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10129 length = Py_MIN(maxlen, length);
10130 if (length <= 0)
10131 return 0;
10132
Victor Stinnerd3f08822012-05-29 12:57:52 +020010133 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +010010134 return length;
10135}
10136
Victor Stinner9310abb2011-10-05 00:59:23 +020010137static PyObject *
10138pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010139 Py_ssize_t left,
10140 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010141 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010142{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010143 PyObject *u;
10144 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010145 int kind;
10146 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010147
10148 if (left < 0)
10149 left = 0;
10150 if (right < 0)
10151 right = 0;
10152
Victor Stinnerc4b49542011-12-11 22:44:26 +010010153 if (left == 0 && right == 0)
10154 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010155
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010156 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10157 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010158 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10159 return NULL;
10160 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010161 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010162 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010163 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010164 if (!u)
10165 return NULL;
10166
10167 kind = PyUnicode_KIND(u);
10168 data = PyUnicode_DATA(u);
10169 if (left)
10170 FILL(kind, data, fill, 0, left);
10171 if (right)
10172 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010173 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010174 assert(_PyUnicode_CheckConsistency(u, 1));
10175 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010176}
10177
Alexander Belopolsky40018472011-02-26 01:02:56 +000010178PyObject *
10179PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010180{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010181 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010182
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010183 if (ensure_unicode(string) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010184 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010185
Benjamin Petersonead6b532011-12-20 17:23:42 -060010186 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010187 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010188 if (PyUnicode_IS_ASCII(string))
10189 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010190 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010191 PyUnicode_GET_LENGTH(string), keepends);
10192 else
10193 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010194 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010195 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010196 break;
10197 case PyUnicode_2BYTE_KIND:
10198 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010199 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010200 PyUnicode_GET_LENGTH(string), keepends);
10201 break;
10202 case PyUnicode_4BYTE_KIND:
10203 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010204 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010205 PyUnicode_GET_LENGTH(string), keepends);
10206 break;
10207 default:
10208 assert(0);
10209 list = 0;
10210 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010211 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010212}
10213
Alexander Belopolsky40018472011-02-26 01:02:56 +000010214static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010215split(PyObject *self,
10216 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010217 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010218{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010219 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010220 void *buf1, *buf2;
10221 Py_ssize_t len1, len2;
10222 PyObject* out;
10223
Guido van Rossumd57fd912000-03-10 22:53:23 +000010224 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010225 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010226
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010227 if (PyUnicode_READY(self) == -1)
10228 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010229
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010230 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010231 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010232 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010233 if (PyUnicode_IS_ASCII(self))
10234 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010235 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010236 PyUnicode_GET_LENGTH(self), maxcount
10237 );
10238 else
10239 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010240 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010241 PyUnicode_GET_LENGTH(self), maxcount
10242 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010243 case PyUnicode_2BYTE_KIND:
10244 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010245 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010246 PyUnicode_GET_LENGTH(self), maxcount
10247 );
10248 case PyUnicode_4BYTE_KIND:
10249 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010250 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010251 PyUnicode_GET_LENGTH(self), maxcount
10252 );
10253 default:
10254 assert(0);
10255 return NULL;
10256 }
10257
10258 if (PyUnicode_READY(substring) == -1)
10259 return NULL;
10260
10261 kind1 = PyUnicode_KIND(self);
10262 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010263 len1 = PyUnicode_GET_LENGTH(self);
10264 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010265 if (kind1 < kind2 || len1 < len2) {
10266 out = PyList_New(1);
10267 if (out == NULL)
10268 return NULL;
10269 Py_INCREF(self);
10270 PyList_SET_ITEM(out, 0, self);
10271 return out;
10272 }
10273 buf1 = PyUnicode_DATA(self);
10274 buf2 = PyUnicode_DATA(substring);
10275 if (kind2 != kind1) {
10276 buf2 = _PyUnicode_AsKind(substring, kind1);
10277 if (!buf2)
10278 return NULL;
10279 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010280
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010281 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010282 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010283 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10284 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010285 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010286 else
10287 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010288 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010289 break;
10290 case PyUnicode_2BYTE_KIND:
10291 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010292 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010293 break;
10294 case PyUnicode_4BYTE_KIND:
10295 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010296 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010297 break;
10298 default:
10299 out = NULL;
10300 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010301 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010302 PyMem_Free(buf2);
10303 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010304}
10305
Alexander Belopolsky40018472011-02-26 01:02:56 +000010306static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010307rsplit(PyObject *self,
10308 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010309 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010310{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010311 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010312 void *buf1, *buf2;
10313 Py_ssize_t len1, len2;
10314 PyObject* out;
10315
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010316 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010317 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010318
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010319 if (PyUnicode_READY(self) == -1)
10320 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010321
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010322 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010323 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010324 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010325 if (PyUnicode_IS_ASCII(self))
10326 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010327 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010328 PyUnicode_GET_LENGTH(self), maxcount
10329 );
10330 else
10331 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010332 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010333 PyUnicode_GET_LENGTH(self), maxcount
10334 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010335 case PyUnicode_2BYTE_KIND:
10336 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010337 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010338 PyUnicode_GET_LENGTH(self), maxcount
10339 );
10340 case PyUnicode_4BYTE_KIND:
10341 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010342 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010343 PyUnicode_GET_LENGTH(self), maxcount
10344 );
10345 default:
10346 assert(0);
10347 return NULL;
10348 }
10349
10350 if (PyUnicode_READY(substring) == -1)
10351 return NULL;
10352
10353 kind1 = PyUnicode_KIND(self);
10354 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010355 len1 = PyUnicode_GET_LENGTH(self);
10356 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010357 if (kind1 < kind2 || len1 < len2) {
10358 out = PyList_New(1);
10359 if (out == NULL)
10360 return NULL;
10361 Py_INCREF(self);
10362 PyList_SET_ITEM(out, 0, self);
10363 return out;
10364 }
10365 buf1 = PyUnicode_DATA(self);
10366 buf2 = PyUnicode_DATA(substring);
10367 if (kind2 != kind1) {
10368 buf2 = _PyUnicode_AsKind(substring, kind1);
10369 if (!buf2)
10370 return NULL;
10371 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010372
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010373 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010374 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010375 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10376 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010377 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010378 else
10379 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010380 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010381 break;
10382 case PyUnicode_2BYTE_KIND:
10383 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010384 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010385 break;
10386 case PyUnicode_4BYTE_KIND:
10387 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010388 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010389 break;
10390 default:
10391 out = NULL;
10392 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010393 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010394 PyMem_Free(buf2);
10395 return out;
10396}
10397
10398static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010399anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10400 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010401{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010402 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010403 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010404 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10405 return asciilib_find(buf1, len1, buf2, len2, offset);
10406 else
10407 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010408 case PyUnicode_2BYTE_KIND:
10409 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10410 case PyUnicode_4BYTE_KIND:
10411 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10412 }
10413 assert(0);
10414 return -1;
10415}
10416
10417static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010418anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10419 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010420{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010421 switch (kind) {
10422 case PyUnicode_1BYTE_KIND:
10423 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10424 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10425 else
10426 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10427 case PyUnicode_2BYTE_KIND:
10428 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10429 case PyUnicode_4BYTE_KIND:
10430 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10431 }
10432 assert(0);
10433 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010434}
10435
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010436static void
10437replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10438 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10439{
10440 int kind = PyUnicode_KIND(u);
10441 void *data = PyUnicode_DATA(u);
10442 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10443 if (kind == PyUnicode_1BYTE_KIND) {
10444 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10445 (Py_UCS1 *)data + len,
10446 u1, u2, maxcount);
10447 }
10448 else if (kind == PyUnicode_2BYTE_KIND) {
10449 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10450 (Py_UCS2 *)data + len,
10451 u1, u2, maxcount);
10452 }
10453 else {
10454 assert(kind == PyUnicode_4BYTE_KIND);
10455 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10456 (Py_UCS4 *)data + len,
10457 u1, u2, maxcount);
10458 }
10459}
10460
Alexander Belopolsky40018472011-02-26 01:02:56 +000010461static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010462replace(PyObject *self, PyObject *str1,
10463 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010464{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010465 PyObject *u;
10466 char *sbuf = PyUnicode_DATA(self);
10467 char *buf1 = PyUnicode_DATA(str1);
10468 char *buf2 = PyUnicode_DATA(str2);
10469 int srelease = 0, release1 = 0, release2 = 0;
10470 int skind = PyUnicode_KIND(self);
10471 int kind1 = PyUnicode_KIND(str1);
10472 int kind2 = PyUnicode_KIND(str2);
10473 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10474 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10475 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010476 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010477 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010478
10479 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010480 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010481 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010482 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010483
Victor Stinner59de0ee2011-10-07 10:01:28 +020010484 if (str1 == str2)
10485 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010486
Victor Stinner49a0a212011-10-12 23:46:10 +020010487 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010488 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10489 if (maxchar < maxchar_str1)
10490 /* substring too wide to be present */
10491 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010492 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10493 /* Replacing str1 with str2 may cause a maxchar reduction in the
10494 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010495 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010496 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010497
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010498 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010499 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010500 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010501 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010502 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010503 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010504 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010505 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010506
Victor Stinner69ed0f42013-04-09 21:48:24 +020010507 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010508 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010509 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010510 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010511 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010512 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010513 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010514 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010515
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010516 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10517 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010518 }
10519 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010520 int rkind = skind;
10521 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010522 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010523
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010524 if (kind1 < rkind) {
10525 /* widen substring */
10526 buf1 = _PyUnicode_AsKind(str1, rkind);
10527 if (!buf1) goto error;
10528 release1 = 1;
10529 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010530 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010531 if (i < 0)
10532 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010533 if (rkind > kind2) {
10534 /* widen replacement */
10535 buf2 = _PyUnicode_AsKind(str2, rkind);
10536 if (!buf2) goto error;
10537 release2 = 1;
10538 }
10539 else if (rkind < kind2) {
10540 /* widen self and buf1 */
10541 rkind = kind2;
10542 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010543 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010544 sbuf = _PyUnicode_AsKind(self, rkind);
10545 if (!sbuf) goto error;
10546 srelease = 1;
10547 buf1 = _PyUnicode_AsKind(str1, rkind);
10548 if (!buf1) goto error;
10549 release1 = 1;
10550 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010551 u = PyUnicode_New(slen, maxchar);
10552 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010553 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010554 assert(PyUnicode_KIND(u) == rkind);
10555 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010556
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010557 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010558 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010559 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010560 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010561 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010562 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010563
10564 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010565 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010566 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010567 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010568 if (i == -1)
10569 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010570 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010571 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010572 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010573 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010574 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010575 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010576 }
10577 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010578 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010579 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010580 int rkind = skind;
10581 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010582
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010583 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010584 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010585 buf1 = _PyUnicode_AsKind(str1, rkind);
10586 if (!buf1) goto error;
10587 release1 = 1;
10588 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010589 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010590 if (n == 0)
10591 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010592 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010593 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010594 buf2 = _PyUnicode_AsKind(str2, rkind);
10595 if (!buf2) goto error;
10596 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010597 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010598 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010599 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010600 rkind = kind2;
10601 sbuf = _PyUnicode_AsKind(self, rkind);
10602 if (!sbuf) goto error;
10603 srelease = 1;
10604 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010605 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010606 buf1 = _PyUnicode_AsKind(str1, rkind);
10607 if (!buf1) goto error;
10608 release1 = 1;
10609 }
10610 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10611 PyUnicode_GET_LENGTH(str1))); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010612 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010613 PyErr_SetString(PyExc_OverflowError,
10614 "replace string is too long");
10615 goto error;
10616 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010617 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010618 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010619 _Py_INCREF_UNICODE_EMPTY();
10620 if (!unicode_empty)
10621 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010622 u = unicode_empty;
10623 goto done;
10624 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010625 if (new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010626 PyErr_SetString(PyExc_OverflowError,
10627 "replace string is too long");
10628 goto error;
10629 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010630 u = PyUnicode_New(new_size, maxchar);
10631 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010632 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010633 assert(PyUnicode_KIND(u) == rkind);
10634 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010635 ires = i = 0;
10636 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010637 while (n-- > 0) {
10638 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010639 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010640 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010641 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010642 if (j == -1)
10643 break;
10644 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010645 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010646 memcpy(res + rkind * ires,
10647 sbuf + rkind * i,
10648 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010649 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010650 }
10651 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010652 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010653 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010654 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010655 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010656 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010657 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010658 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010659 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010660 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010661 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010662 memcpy(res + rkind * ires,
10663 sbuf + rkind * i,
10664 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010665 }
10666 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010667 /* interleave */
10668 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010669 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010670 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010671 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010672 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010673 if (--n <= 0)
10674 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010675 memcpy(res + rkind * ires,
10676 sbuf + rkind * i,
10677 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010678 ires++;
10679 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010680 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010681 memcpy(res + rkind * ires,
10682 sbuf + rkind * i,
10683 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010684 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010685 }
10686
10687 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010688 unicode_adjust_maxchar(&u);
10689 if (u == NULL)
10690 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010691 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010692
10693 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010694 if (srelease)
10695 PyMem_FREE(sbuf);
10696 if (release1)
10697 PyMem_FREE(buf1);
10698 if (release2)
10699 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010700 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010701 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010702
Benjamin Peterson29060642009-01-31 22:14:21 +000010703 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010704 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010705 if (srelease)
10706 PyMem_FREE(sbuf);
10707 if (release1)
10708 PyMem_FREE(buf1);
10709 if (release2)
10710 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010711 return unicode_result_unchanged(self);
10712
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010713 error:
10714 if (srelease && sbuf)
10715 PyMem_FREE(sbuf);
10716 if (release1 && buf1)
10717 PyMem_FREE(buf1);
10718 if (release2 && buf2)
10719 PyMem_FREE(buf2);
10720 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010721}
10722
10723/* --- Unicode Object Methods --------------------------------------------- */
10724
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010725PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010726 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010727\n\
10728Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010729characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010730
10731static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010732unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010733{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010734 if (PyUnicode_READY(self) == -1)
10735 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010736 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010737}
10738
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010739PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010740 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010741\n\
10742Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010743have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010744
10745static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010746unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010747{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010748 if (PyUnicode_READY(self) == -1)
10749 return NULL;
10750 if (PyUnicode_GET_LENGTH(self) == 0)
10751 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010752 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010753}
10754
Benjamin Petersond5890c82012-01-14 13:23:30 -050010755PyDoc_STRVAR(casefold__doc__,
10756 "S.casefold() -> str\n\
10757\n\
10758Return a version of S suitable for caseless comparisons.");
10759
10760static PyObject *
10761unicode_casefold(PyObject *self)
10762{
10763 if (PyUnicode_READY(self) == -1)
10764 return NULL;
10765 if (PyUnicode_IS_ASCII(self))
10766 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010767 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010768}
10769
10770
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010771/* Argument converter. Accepts a single Unicode character. */
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010772
10773static int
10774convert_uc(PyObject *obj, void *addr)
10775{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010776 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010777
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010778 if (!PyUnicode_Check(obj)) {
10779 PyErr_Format(PyExc_TypeError,
10780 "The fill character must be a unicode character, "
10781 "not %.100s", Py_TYPE(obj)->tp_name);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010782 return 0;
10783 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010784 if (PyUnicode_READY(obj) < 0)
10785 return 0;
10786 if (PyUnicode_GET_LENGTH(obj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010787 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010788 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010789 return 0;
10790 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010791 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010792 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010793}
10794
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010795PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010796 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010797\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010798Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010799done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010800
10801static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010802unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010803{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010804 Py_ssize_t marg, left;
10805 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010806 Py_UCS4 fillchar = ' ';
10807
Victor Stinnere9a29352011-10-01 02:14:59 +020010808 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010809 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010810
Benjamin Petersonbac79492012-01-14 13:34:47 -050010811 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010812 return NULL;
10813
Victor Stinnerc4b49542011-12-11 22:44:26 +010010814 if (PyUnicode_GET_LENGTH(self) >= width)
10815 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010816
Victor Stinnerc4b49542011-12-11 22:44:26 +010010817 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010818 left = marg / 2 + (marg & width & 1);
10819
Victor Stinner9310abb2011-10-05 00:59:23 +020010820 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010821}
10822
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010823/* This function assumes that str1 and str2 are readied by the caller. */
10824
Marc-André Lemburge5034372000-08-08 08:04:29 +000010825static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010826unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010827{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010828#define COMPARE(TYPE1, TYPE2) \
10829 do { \
10830 TYPE1* p1 = (TYPE1 *)data1; \
10831 TYPE2* p2 = (TYPE2 *)data2; \
10832 TYPE1* end = p1 + len; \
10833 Py_UCS4 c1, c2; \
10834 for (; p1 != end; p1++, p2++) { \
10835 c1 = *p1; \
10836 c2 = *p2; \
10837 if (c1 != c2) \
10838 return (c1 < c2) ? -1 : 1; \
10839 } \
10840 } \
10841 while (0)
10842
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010843 int kind1, kind2;
10844 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010845 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010846
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010847 kind1 = PyUnicode_KIND(str1);
10848 kind2 = PyUnicode_KIND(str2);
10849 data1 = PyUnicode_DATA(str1);
10850 data2 = PyUnicode_DATA(str2);
10851 len1 = PyUnicode_GET_LENGTH(str1);
10852 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010853 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010854
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010855 switch(kind1) {
10856 case PyUnicode_1BYTE_KIND:
10857 {
10858 switch(kind2) {
10859 case PyUnicode_1BYTE_KIND:
10860 {
10861 int cmp = memcmp(data1, data2, len);
10862 /* normalize result of memcmp() into the range [-1; 1] */
10863 if (cmp < 0)
10864 return -1;
10865 if (cmp > 0)
10866 return 1;
10867 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020010868 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010869 case PyUnicode_2BYTE_KIND:
10870 COMPARE(Py_UCS1, Py_UCS2);
10871 break;
10872 case PyUnicode_4BYTE_KIND:
10873 COMPARE(Py_UCS1, Py_UCS4);
10874 break;
10875 default:
10876 assert(0);
10877 }
10878 break;
10879 }
10880 case PyUnicode_2BYTE_KIND:
10881 {
10882 switch(kind2) {
10883 case PyUnicode_1BYTE_KIND:
10884 COMPARE(Py_UCS2, Py_UCS1);
10885 break;
10886 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010887 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010888 COMPARE(Py_UCS2, Py_UCS2);
10889 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010890 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010891 case PyUnicode_4BYTE_KIND:
10892 COMPARE(Py_UCS2, Py_UCS4);
10893 break;
10894 default:
10895 assert(0);
10896 }
10897 break;
10898 }
10899 case PyUnicode_4BYTE_KIND:
10900 {
10901 switch(kind2) {
10902 case PyUnicode_1BYTE_KIND:
10903 COMPARE(Py_UCS4, Py_UCS1);
10904 break;
10905 case PyUnicode_2BYTE_KIND:
10906 COMPARE(Py_UCS4, Py_UCS2);
10907 break;
10908 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010909 {
10910#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10911 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10912 /* normalize result of wmemcmp() into the range [-1; 1] */
10913 if (cmp < 0)
10914 return -1;
10915 if (cmp > 0)
10916 return 1;
10917#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010918 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020010919#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010920 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010921 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010922 default:
10923 assert(0);
10924 }
10925 break;
10926 }
10927 default:
10928 assert(0);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010929 }
10930
Victor Stinner770e19e2012-10-04 22:59:45 +020010931 if (len1 == len2)
10932 return 0;
10933 if (len1 < len2)
10934 return -1;
10935 else
10936 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010937
10938#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000010939}
10940
Benjamin Peterson621b4302016-09-09 13:54:34 -070010941static int
Victor Stinnere5567ad2012-10-23 02:48:49 +020010942unicode_compare_eq(PyObject *str1, PyObject *str2)
10943{
10944 int kind;
10945 void *data1, *data2;
10946 Py_ssize_t len;
10947 int cmp;
10948
Victor Stinnere5567ad2012-10-23 02:48:49 +020010949 len = PyUnicode_GET_LENGTH(str1);
10950 if (PyUnicode_GET_LENGTH(str2) != len)
10951 return 0;
10952 kind = PyUnicode_KIND(str1);
10953 if (PyUnicode_KIND(str2) != kind)
10954 return 0;
10955 data1 = PyUnicode_DATA(str1);
10956 data2 = PyUnicode_DATA(str2);
10957
10958 cmp = memcmp(data1, data2, len * kind);
10959 return (cmp == 0);
10960}
10961
10962
Alexander Belopolsky40018472011-02-26 01:02:56 +000010963int
10964PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010965{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010966 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10967 if (PyUnicode_READY(left) == -1 ||
10968 PyUnicode_READY(right) == -1)
10969 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010010970
10971 /* a string is equal to itself */
10972 if (left == right)
10973 return 0;
10974
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010975 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010976 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010977 PyErr_Format(PyExc_TypeError,
10978 "Can't compare %.100s and %.100s",
10979 left->ob_type->tp_name,
10980 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010981 return -1;
10982}
10983
Martin v. Löwis5b222132007-06-10 09:51:05 +000010984int
Victor Stinnerad14ccd2013-11-07 00:46:04 +010010985_PyUnicode_CompareWithId(PyObject *left, _Py_Identifier *right)
10986{
10987 PyObject *right_str = _PyUnicode_FromId(right); /* borrowed */
10988 if (right_str == NULL)
10989 return -1;
10990 return PyUnicode_Compare(left, right_str);
10991}
10992
10993int
Martin v. Löwis5b222132007-06-10 09:51:05 +000010994PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10995{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010996 Py_ssize_t i;
10997 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010998 Py_UCS4 chr;
10999
Victor Stinner910337b2011-10-03 03:20:16 +020011000 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011001 if (PyUnicode_READY(uni) == -1)
11002 return -1;
11003 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011004 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010011005 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010011006 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011007 size_t len, len2 = strlen(str);
11008 int cmp;
11009
11010 len = Py_MIN(len1, len2);
11011 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010011012 if (cmp != 0) {
11013 if (cmp < 0)
11014 return -1;
11015 else
11016 return 1;
11017 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010011018 if (len1 > len2)
11019 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011020 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010011021 return -1; /* str is longer */
11022 return 0;
11023 }
11024 else {
11025 void *data = PyUnicode_DATA(uni);
11026 /* Compare Unicode string and source character set string */
11027 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020011028 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010011029 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11030 /* This check keeps Python strings that end in '\0' from comparing equal
11031 to C strings identical up to that point. */
11032 if (PyUnicode_GET_LENGTH(uni) != i || chr)
11033 return 1; /* uni is longer */
11034 if (str[i])
11035 return -1; /* str is longer */
11036 return 0;
11037 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000011038}
11039
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011040
/* Map a C truth value onto Py_True / Py_False.  The macro does not touch
   the reference count of the object it evaluates to. */
#define TEST_COND(cond) ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011043
Alexander Belopolsky40018472011-02-26 01:02:56 +000011044PyObject *
11045PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011046{
11047 int result;
Victor Stinnere5567ad2012-10-23 02:48:49 +020011048 PyObject *v;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011049
Victor Stinnere5567ad2012-10-23 02:48:49 +020011050 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11051 Py_RETURN_NOTIMPLEMENTED;
11052
11053 if (PyUnicode_READY(left) == -1 ||
11054 PyUnicode_READY(right) == -1)
11055 return NULL;
11056
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011057 if (left == right) {
11058 switch (op) {
11059 case Py_EQ:
11060 case Py_LE:
11061 case Py_GE:
11062 /* a string is equal to itself */
11063 v = Py_True;
11064 break;
11065 case Py_NE:
11066 case Py_LT:
11067 case Py_GT:
11068 v = Py_False;
11069 break;
11070 default:
11071 PyErr_BadArgument();
11072 return NULL;
11073 }
11074 }
11075 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020011076 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010011077 result ^= (op == Py_NE);
11078 v = TEST_COND(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020011079 }
11080 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020011081 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011082
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011083 /* Convert the return value to a Boolean */
11084 switch (op) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011085 case Py_LE:
11086 v = TEST_COND(result <= 0);
11087 break;
11088 case Py_GE:
11089 v = TEST_COND(result >= 0);
11090 break;
11091 case Py_LT:
11092 v = TEST_COND(result == -1);
11093 break;
11094 case Py_GT:
11095 v = TEST_COND(result == 1);
11096 break;
11097 default:
11098 PyErr_BadArgument();
11099 return NULL;
11100 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011101 }
Victor Stinnere5567ad2012-10-23 02:48:49 +020011102 Py_INCREF(v);
11103 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011104}
11105
Alexander Belopolsky40018472011-02-26 01:02:56 +000011106int
Raymond Hettingerac2ef652015-07-04 16:04:44 -070011107_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11108{
11109 return unicode_eq(aa, bb);
11110}
11111
11112int
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011113PyUnicode_Contains(PyObject *str, PyObject *substr)
Guido van Rossum403d68b2000-03-13 15:55:09 +000011114{
Victor Stinner77282cb2013-04-14 19:22:47 +020011115 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011116 void *buf1, *buf2;
11117 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011118 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011119
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011120 if (!PyUnicode_Check(substr)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011121 PyErr_Format(PyExc_TypeError,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011122 "'in <string>' requires string as left operand, not %.100s",
11123 Py_TYPE(substr)->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011124 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011125 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011126 if (PyUnicode_READY(substr) == -1)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011127 return -1;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011128 if (ensure_unicode(str) < 0)
11129 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011130
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011131 kind1 = PyUnicode_KIND(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011132 kind2 = PyUnicode_KIND(substr);
11133 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011134 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011135 len1 = PyUnicode_GET_LENGTH(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011136 len2 = PyUnicode_GET_LENGTH(substr);
11137 if (len1 < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011138 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011139 buf1 = PyUnicode_DATA(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011140 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011141 if (len2 == 1) {
11142 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11143 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011144 return result;
11145 }
11146 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011147 buf2 = _PyUnicode_AsKind(substr, kind1);
11148 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011149 return -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011150 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011151
Victor Stinner77282cb2013-04-14 19:22:47 +020011152 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011153 case PyUnicode_1BYTE_KIND:
11154 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11155 break;
11156 case PyUnicode_2BYTE_KIND:
11157 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11158 break;
11159 case PyUnicode_4BYTE_KIND:
11160 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11161 break;
11162 default:
11163 result = -1;
11164 assert(0);
11165 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011166
Victor Stinner77282cb2013-04-14 19:22:47 +020011167 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011168 PyMem_Free(buf2);
11169
Guido van Rossum403d68b2000-03-13 15:55:09 +000011170 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011171}
11172
Guido van Rossumd57fd912000-03-10 22:53:23 +000011173/* Concat to string or Unicode object giving a new Unicode object. */
11174
Alexander Belopolsky40018472011-02-26 01:02:56 +000011175PyObject *
11176PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011177{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011178 PyObject *result;
Victor Stinner127226b2011-10-13 01:12:34 +020011179 Py_UCS4 maxchar, maxchar2;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011180 Py_ssize_t left_len, right_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011181
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011182 if (ensure_unicode(left) < 0 || ensure_unicode(right) < 0)
11183 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011184
11185 /* Shortcuts */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011186 if (left == unicode_empty)
11187 return PyUnicode_FromObject(right);
11188 if (right == unicode_empty)
11189 return PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011190
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011191 left_len = PyUnicode_GET_LENGTH(left);
11192 right_len = PyUnicode_GET_LENGTH(right);
11193 if (left_len > PY_SSIZE_T_MAX - right_len) {
Victor Stinner488fa492011-12-12 00:01:39 +010011194 PyErr_SetString(PyExc_OverflowError,
11195 "strings are too large to concat");
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011196 return NULL;
Victor Stinner488fa492011-12-12 00:01:39 +010011197 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011198 new_len = left_len + right_len;
Victor Stinner488fa492011-12-12 00:01:39 +010011199
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011200 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11201 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011202 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011203
Guido van Rossumd57fd912000-03-10 22:53:23 +000011204 /* Concat the two Unicode strings */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011205 result = PyUnicode_New(new_len, maxchar);
11206 if (result == NULL)
11207 return NULL;
11208 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11209 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11210 assert(_PyUnicode_CheckConsistency(result, 1));
11211 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011212}
11213
Walter Dörwald1ab83302007-05-18 17:15:44 +000011214void
Victor Stinner23e56682011-10-03 03:54:37 +020011215PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011216{
Victor Stinner23e56682011-10-03 03:54:37 +020011217 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011218 Py_UCS4 maxchar, maxchar2;
11219 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011220
11221 if (p_left == NULL) {
11222 if (!PyErr_Occurred())
11223 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011224 return;
11225 }
Victor Stinner23e56682011-10-03 03:54:37 +020011226 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011227 if (right == NULL || left == NULL
11228 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011229 if (!PyErr_Occurred())
11230 PyErr_BadInternalCall();
11231 goto error;
11232 }
11233
Benjamin Petersonbac79492012-01-14 13:34:47 -050011234 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011235 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011236 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011237 goto error;
11238
Victor Stinner488fa492011-12-12 00:01:39 +010011239 /* Shortcuts */
11240 if (left == unicode_empty) {
11241 Py_DECREF(left);
11242 Py_INCREF(right);
11243 *p_left = right;
11244 return;
11245 }
11246 if (right == unicode_empty)
11247 return;
11248
11249 left_len = PyUnicode_GET_LENGTH(left);
11250 right_len = PyUnicode_GET_LENGTH(right);
11251 if (left_len > PY_SSIZE_T_MAX - right_len) {
11252 PyErr_SetString(PyExc_OverflowError,
11253 "strings are too large to concat");
11254 goto error;
11255 }
11256 new_len = left_len + right_len;
11257
11258 if (unicode_modifiable(left)
11259 && PyUnicode_CheckExact(right)
11260 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011261 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11262 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011263 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011264 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011265 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11266 {
11267 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011268 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011269 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011270
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011271 /* copy 'right' into the newly allocated area of 'left' */
11272 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011273 }
Victor Stinner488fa492011-12-12 00:01:39 +010011274 else {
11275 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11276 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011277 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011278
Victor Stinner488fa492011-12-12 00:01:39 +010011279 /* Concat the two Unicode strings */
11280 res = PyUnicode_New(new_len, maxchar);
11281 if (res == NULL)
11282 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011283 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11284 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011285 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011286 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011287 }
11288 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011289 return;
11290
11291error:
Victor Stinner488fa492011-12-12 00:01:39 +010011292 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011293}
11294
11295void
11296PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11297{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011298 PyUnicode_Append(pleft, right);
11299 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011300}
11301
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011302/*
11303Wraps stringlib_parse_args_finds() and additionally ensures that the
11304first argument is a unicode object.
11305*/
11306
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070011307static inline int
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011308parse_args_finds_unicode(const char * function_name, PyObject *args,
11309 PyObject **substring,
11310 Py_ssize_t *start, Py_ssize_t *end)
11311{
11312 if(stringlib_parse_args_finds(function_name, args, substring,
11313 start, end)) {
11314 if (ensure_unicode(*substring) < 0)
11315 return 0;
11316 return 1;
11317 }
11318 return 0;
11319}
11320
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011321PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011322 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011323\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011324Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011325string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011326interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011327
11328static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011329unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011330{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011331 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011332 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011333 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011334 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011335 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011336 void *buf1, *buf2;
11337 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011338
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011339 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011340 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011341
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011342 kind1 = PyUnicode_KIND(self);
11343 kind2 = PyUnicode_KIND(substring);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011344 if (kind1 < kind2)
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011345 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011346
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011347 len1 = PyUnicode_GET_LENGTH(self);
11348 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011349 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011350 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011351 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011352
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011353 buf1 = PyUnicode_DATA(self);
11354 buf2 = PyUnicode_DATA(substring);
11355 if (kind2 != kind1) {
11356 buf2 = _PyUnicode_AsKind(substring, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011357 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011358 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011359 }
11360 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011361 case PyUnicode_1BYTE_KIND:
11362 iresult = ucs1lib_count(
11363 ((Py_UCS1*)buf1) + start, end - start,
11364 buf2, len2, PY_SSIZE_T_MAX
11365 );
11366 break;
11367 case PyUnicode_2BYTE_KIND:
11368 iresult = ucs2lib_count(
11369 ((Py_UCS2*)buf1) + start, end - start,
11370 buf2, len2, PY_SSIZE_T_MAX
11371 );
11372 break;
11373 case PyUnicode_4BYTE_KIND:
11374 iresult = ucs4lib_count(
11375 ((Py_UCS4*)buf1) + start, end - start,
11376 buf2, len2, PY_SSIZE_T_MAX
11377 );
11378 break;
11379 default:
11380 assert(0); iresult = 0;
11381 }
11382
11383 result = PyLong_FromSsize_t(iresult);
11384
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011385 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011386 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011387
Guido van Rossumd57fd912000-03-10 22:53:23 +000011388 return result;
11389}
11390
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011391PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000011392 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011393\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000011394Encode S using the codec registered for encoding. Default encoding\n\
11395is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000011396handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000011397a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
11398'xmlcharrefreplace' as well as any other name registered with\n\
11399codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011400
11401static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011402unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011403{
Benjamin Peterson308d6372009-09-18 21:42:35 +000011404 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011405 char *encoding = NULL;
11406 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000011407
Benjamin Peterson308d6372009-09-18 21:42:35 +000011408 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
11409 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011410 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011411 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011412}
11413
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011414PyDoc_STRVAR(expandtabs__doc__,
Ezio Melotti745d54d2013-11-16 19:10:57 +020011415 "S.expandtabs(tabsize=8) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011416\n\
11417Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011418If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011419
11420static PyObject*
Ezio Melotti745d54d2013-11-16 19:10:57 +020011421unicode_expandtabs(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011422{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011423 Py_ssize_t i, j, line_pos, src_len, incr;
11424 Py_UCS4 ch;
11425 PyObject *u;
11426 void *src_data, *dest_data;
Ezio Melotti745d54d2013-11-16 19:10:57 +020011427 static char *kwlist[] = {"tabsize", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011428 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011429 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011430 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011431
Ezio Melotti745d54d2013-11-16 19:10:57 +020011432 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:expandtabs",
11433 kwlist, &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000011434 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011435
Antoine Pitrou22425222011-10-04 19:10:51 +020011436 if (PyUnicode_READY(self) == -1)
11437 return NULL;
11438
Thomas Wouters7e474022000-07-16 12:04:32 +000011439 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011440 src_len = PyUnicode_GET_LENGTH(self);
11441 i = j = line_pos = 0;
11442 kind = PyUnicode_KIND(self);
11443 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011444 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011445 for (; i < src_len; i++) {
11446 ch = PyUnicode_READ(kind, src_data, i);
11447 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011448 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011449 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011450 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011451 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011452 goto overflow;
11453 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011454 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011455 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011456 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011457 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011458 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011459 goto overflow;
11460 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011461 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011462 if (ch == '\n' || ch == '\r')
11463 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011464 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011465 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011466 if (!found)
11467 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011468
Guido van Rossumd57fd912000-03-10 22:53:23 +000011469 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011470 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011471 if (!u)
11472 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011473 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011474
Antoine Pitroue71d5742011-10-04 15:55:09 +020011475 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011476
Antoine Pitroue71d5742011-10-04 15:55:09 +020011477 for (; i < src_len; i++) {
11478 ch = PyUnicode_READ(kind, src_data, i);
11479 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011480 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011481 incr = tabsize - (line_pos % tabsize);
11482 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010011483 FILL(kind, dest_data, ' ', j, incr);
11484 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011485 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011486 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011487 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011488 line_pos++;
11489 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011490 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011491 if (ch == '\n' || ch == '\r')
11492 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011493 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011494 }
11495 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011496 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011497
Antoine Pitroue71d5742011-10-04 15:55:09 +020011498 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011499 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11500 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011501}
11502
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011503PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011504 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011505\n\
11506Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011507such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011508arguments start and end are interpreted as in slice notation.\n\
11509\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011510Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011511
11512static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011513unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011514{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011515 /* initialize variables to prevent gcc warning */
11516 PyObject *substring = NULL;
11517 Py_ssize_t start = 0;
11518 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011519 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011520
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011521 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011522 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011523
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011524 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011525 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011526
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011527 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011528
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011529 if (result == -2)
11530 return NULL;
11531
Christian Heimes217cfd12007-12-02 14:31:20 +000011532 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011533}
11534
11535static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011536unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011537{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011538 void *data;
11539 enum PyUnicode_Kind kind;
11540 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011541
11542 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
11543 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011544 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011545 }
11546 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11547 PyErr_SetString(PyExc_IndexError, "string index out of range");
11548 return NULL;
11549 }
11550 kind = PyUnicode_KIND(self);
11551 data = PyUnicode_DATA(self);
11552 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011553 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011554}
11555
Guido van Rossumc2504932007-09-18 19:42:40 +000011556/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011557 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011558static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011559unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011560{
Guido van Rossumc2504932007-09-18 19:42:40 +000011561 Py_ssize_t len;
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011562 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011563
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011564#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011565 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011566#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011567 if (_PyUnicode_HASH(self) != -1)
11568 return _PyUnicode_HASH(self);
11569 if (PyUnicode_READY(self) == -1)
11570 return -1;
11571 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011572 /*
11573 We make the hash of the empty string be 0, rather than using
11574 (prefix ^ suffix), since this slightly obfuscates the hash secret
11575 */
11576 if (len == 0) {
11577 _PyUnicode_HASH(self) = 0;
11578 return 0;
11579 }
Christian Heimes985ecdc2013-11-20 11:46:18 +010011580 x = _Py_HashBytes(PyUnicode_DATA(self),
11581 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011582 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011583 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011584}
11585
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011586PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011587 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011588\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011589Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011590
11591static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011592unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011593{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011594 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011595 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011596 PyObject *substring = NULL;
11597 Py_ssize_t start = 0;
11598 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011599
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011600 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011601 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011602
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011603 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011604 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011605
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011606 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011607
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011608 if (result == -2)
11609 return NULL;
11610
Guido van Rossumd57fd912000-03-10 22:53:23 +000011611 if (result < 0) {
11612 PyErr_SetString(PyExc_ValueError, "substring not found");
11613 return NULL;
11614 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011615
Christian Heimes217cfd12007-12-02 14:31:20 +000011616 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011617}
11618
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011619PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011620 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011621\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011622Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011623at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011624
11625static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011626unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011627{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011628 Py_ssize_t i, length;
11629 int kind;
11630 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011631 int cased;
11632
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011633 if (PyUnicode_READY(self) == -1)
11634 return NULL;
11635 length = PyUnicode_GET_LENGTH(self);
11636 kind = PyUnicode_KIND(self);
11637 data = PyUnicode_DATA(self);
11638
Guido van Rossumd57fd912000-03-10 22:53:23 +000011639 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011640 if (length == 1)
11641 return PyBool_FromLong(
11642 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011643
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011644 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011645 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011646 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011647
Guido van Rossumd57fd912000-03-10 22:53:23 +000011648 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011649 for (i = 0; i < length; i++) {
11650 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011651
Benjamin Peterson29060642009-01-31 22:14:21 +000011652 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11653 return PyBool_FromLong(0);
11654 else if (!cased && Py_UNICODE_ISLOWER(ch))
11655 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011656 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011657 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011658}
11659
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011660PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011661 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011662\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011663Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011664at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011665
11666static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011667unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011668{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011669 Py_ssize_t i, length;
11670 int kind;
11671 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011672 int cased;
11673
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011674 if (PyUnicode_READY(self) == -1)
11675 return NULL;
11676 length = PyUnicode_GET_LENGTH(self);
11677 kind = PyUnicode_KIND(self);
11678 data = PyUnicode_DATA(self);
11679
Guido van Rossumd57fd912000-03-10 22:53:23 +000011680 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011681 if (length == 1)
11682 return PyBool_FromLong(
11683 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011684
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011685 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011686 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011687 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011688
Guido van Rossumd57fd912000-03-10 22:53:23 +000011689 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011690 for (i = 0; i < length; i++) {
11691 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011692
Benjamin Peterson29060642009-01-31 22:14:21 +000011693 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11694 return PyBool_FromLong(0);
11695 else if (!cased && Py_UNICODE_ISUPPER(ch))
11696 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011697 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011698 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011699}
11700
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011701PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011702 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011703\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011704Return True if S is a titlecased string and there is at least one\n\
11705character in S, i.e. upper- and titlecase characters may only\n\
11706follow uncased characters and lowercase characters only cased ones.\n\
11707Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011708
11709static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011710unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011711{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011712 Py_ssize_t i, length;
11713 int kind;
11714 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011715 int cased, previous_is_cased;
11716
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011717 if (PyUnicode_READY(self) == -1)
11718 return NULL;
11719 length = PyUnicode_GET_LENGTH(self);
11720 kind = PyUnicode_KIND(self);
11721 data = PyUnicode_DATA(self);
11722
Guido van Rossumd57fd912000-03-10 22:53:23 +000011723 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011724 if (length == 1) {
11725 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11726 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11727 (Py_UNICODE_ISUPPER(ch) != 0));
11728 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011729
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011730 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011731 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011732 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011733
Guido van Rossumd57fd912000-03-10 22:53:23 +000011734 cased = 0;
11735 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011736 for (i = 0; i < length; i++) {
11737 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011738
Benjamin Peterson29060642009-01-31 22:14:21 +000011739 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11740 if (previous_is_cased)
11741 return PyBool_FromLong(0);
11742 previous_is_cased = 1;
11743 cased = 1;
11744 }
11745 else if (Py_UNICODE_ISLOWER(ch)) {
11746 if (!previous_is_cased)
11747 return PyBool_FromLong(0);
11748 previous_is_cased = 1;
11749 cased = 1;
11750 }
11751 else
11752 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011753 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011754 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011755}
11756
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011757PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011758 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011759\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011760Return True if all characters in S are whitespace\n\
11761and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011762
11763static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011764unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011765{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011766 Py_ssize_t i, length;
11767 int kind;
11768 void *data;
11769
11770 if (PyUnicode_READY(self) == -1)
11771 return NULL;
11772 length = PyUnicode_GET_LENGTH(self);
11773 kind = PyUnicode_KIND(self);
11774 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011775
Guido van Rossumd57fd912000-03-10 22:53:23 +000011776 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011777 if (length == 1)
11778 return PyBool_FromLong(
11779 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011780
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011781 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011782 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011783 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011784
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011785 for (i = 0; i < length; i++) {
11786 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011787 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011788 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011789 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011790 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011791}
11792
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011793PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011794 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011795\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011796Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011797and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011798
11799static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011800unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011801{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011802 Py_ssize_t i, length;
11803 int kind;
11804 void *data;
11805
11806 if (PyUnicode_READY(self) == -1)
11807 return NULL;
11808 length = PyUnicode_GET_LENGTH(self);
11809 kind = PyUnicode_KIND(self);
11810 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011811
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011812 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011813 if (length == 1)
11814 return PyBool_FromLong(
11815 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011816
11817 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011818 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011819 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011820
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011821 for (i = 0; i < length; i++) {
11822 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011823 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011824 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011825 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011826}
11827
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011828PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011829 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011830\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011831Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011832and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011833
11834static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011835unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011836{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011837 int kind;
11838 void *data;
11839 Py_ssize_t len, i;
11840
11841 if (PyUnicode_READY(self) == -1)
11842 return NULL;
11843
11844 kind = PyUnicode_KIND(self);
11845 data = PyUnicode_DATA(self);
11846 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011847
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011848 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011849 if (len == 1) {
11850 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11851 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11852 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011853
11854 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011855 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011856 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011857
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011858 for (i = 0; i < len; i++) {
11859 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011860 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011861 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011862 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011863 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011864}
11865
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011866PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011867 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011868\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011869Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011870False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011871
11872static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011873unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011874{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011875 Py_ssize_t i, length;
11876 int kind;
11877 void *data;
11878
11879 if (PyUnicode_READY(self) == -1)
11880 return NULL;
11881 length = PyUnicode_GET_LENGTH(self);
11882 kind = PyUnicode_KIND(self);
11883 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011884
Guido van Rossumd57fd912000-03-10 22:53:23 +000011885 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011886 if (length == 1)
11887 return PyBool_FromLong(
11888 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011889
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011890 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011891 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011892 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011893
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011894 for (i = 0; i < length; i++) {
11895 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011896 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011897 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011898 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011899}
11900
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011901PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011902 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011903\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011904Return True if all characters in S are digits\n\
11905and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011906
11907static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011908unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011909{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011910 Py_ssize_t i, length;
11911 int kind;
11912 void *data;
11913
11914 if (PyUnicode_READY(self) == -1)
11915 return NULL;
11916 length = PyUnicode_GET_LENGTH(self);
11917 kind = PyUnicode_KIND(self);
11918 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011919
Guido van Rossumd57fd912000-03-10 22:53:23 +000011920 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011921 if (length == 1) {
11922 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11923 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11924 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011925
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011926 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011927 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011928 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011929
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011930 for (i = 0; i < length; i++) {
11931 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011932 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011933 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011934 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011935}
11936
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011937PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011938 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011939\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011940Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011941False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011942
11943static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011944unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011945{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011946 Py_ssize_t i, length;
11947 int kind;
11948 void *data;
11949
11950 if (PyUnicode_READY(self) == -1)
11951 return NULL;
11952 length = PyUnicode_GET_LENGTH(self);
11953 kind = PyUnicode_KIND(self);
11954 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011955
Guido van Rossumd57fd912000-03-10 22:53:23 +000011956 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011957 if (length == 1)
11958 return PyBool_FromLong(
11959 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011960
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011961 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011962 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011963 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011964
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011965 for (i = 0; i < length; i++) {
11966 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011967 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011968 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011969 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011970}
11971
Martin v. Löwis47383402007-08-15 07:32:56 +000011972int
11973PyUnicode_IsIdentifier(PyObject *self)
11974{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011975 int kind;
11976 void *data;
11977 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011978 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011979
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011980 if (PyUnicode_READY(self) == -1) {
11981 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011982 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011983 }
11984
11985 /* Special case for empty strings */
11986 if (PyUnicode_GET_LENGTH(self) == 0)
11987 return 0;
11988 kind = PyUnicode_KIND(self);
11989 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011990
11991 /* PEP 3131 says that the first character must be in
11992 XID_Start and subsequent characters in XID_Continue,
11993 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011994 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011995 letters, digits, underscore). However, given the current
11996 definition of XID_Start and XID_Continue, it is sufficient
11997 to check just for these, except that _ must be allowed
11998 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011999 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050012000 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000012001 return 0;
12002
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040012003 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012004 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000012005 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000012006 return 1;
12007}
12008
12009PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012010 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000012011\n\
12012Return True if S is a valid identifier according\n\
Raymond Hettinger378170d2013-03-23 08:21:12 -070012013to the language definition.\n\
12014\n\
12015Use keyword.iskeyword() to test for reserved identifiers\n\
12016such as \"def\" and \"class\".\n");
Martin v. Löwis47383402007-08-15 07:32:56 +000012017
12018static PyObject*
12019unicode_isidentifier(PyObject *self)
12020{
12021 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12022}
12023
Georg Brandl559e5d72008-06-11 18:37:52 +000012024PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012025 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000012026\n\
12027Return True if all characters in S are considered\n\
12028printable in repr() or S is empty, False otherwise.");
12029
12030static PyObject*
12031unicode_isprintable(PyObject *self)
12032{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012033 Py_ssize_t i, length;
12034 int kind;
12035 void *data;
12036
12037 if (PyUnicode_READY(self) == -1)
12038 return NULL;
12039 length = PyUnicode_GET_LENGTH(self);
12040 kind = PyUnicode_KIND(self);
12041 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000012042
12043 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012044 if (length == 1)
12045 return PyBool_FromLong(
12046 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000012047
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012048 for (i = 0; i < length; i++) {
12049 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012050 Py_RETURN_FALSE;
12051 }
12052 }
12053 Py_RETURN_TRUE;
12054}
12055
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012056PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000012057 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012058\n\
12059Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000012060iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012061
12062static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012063unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012064{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012065 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012066}
12067
Martin v. Löwis18e16552006-02-15 17:27:45 +000012068static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012069unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012070{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012071 if (PyUnicode_READY(self) == -1)
12072 return -1;
12073 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012074}
12075
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012076PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012077 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012078\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012079Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012080done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012081
12082static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012083unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012084{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012085 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012086 Py_UCS4 fillchar = ' ';
12087
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012088 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012089 return NULL;
12090
Benjamin Petersonbac79492012-01-14 13:34:47 -050012091 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012092 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012093
Victor Stinnerc4b49542011-12-11 22:44:26 +010012094 if (PyUnicode_GET_LENGTH(self) >= width)
12095 return unicode_result_unchanged(self);
12096
12097 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012098}
12099
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012100PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012101 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012102\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012103Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012104
12105static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012106unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012107{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012108 if (PyUnicode_READY(self) == -1)
12109 return NULL;
12110 if (PyUnicode_IS_ASCII(self))
12111 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012112 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012113}
12114
/* Strip modes: which end(s) of the string are trimmed. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple() format strings, indexed by the strip modes above. */
static const char * const stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for strip mode i: the matching format string with its
   three-character "|O:" prefix skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
12123
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012124/* externally visible for str.strip(unicode) */
12125PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012126_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012127{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012128 void *data;
12129 int kind;
12130 Py_ssize_t i, j, len;
12131 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012132 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012133
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012134 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12135 return NULL;
12136
12137 kind = PyUnicode_KIND(self);
12138 data = PyUnicode_DATA(self);
12139 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020012140 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012141 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12142 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020012143 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012144
Benjamin Peterson14339b62009-01-31 16:36:08 +000012145 i = 0;
12146 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012147 while (i < len) {
12148 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12149 if (!BLOOM(sepmask, ch))
12150 break;
12151 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12152 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012153 i++;
12154 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012155 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012156
Benjamin Peterson14339b62009-01-31 16:36:08 +000012157 j = len;
12158 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012159 j--;
12160 while (j >= i) {
12161 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12162 if (!BLOOM(sepmask, ch))
12163 break;
12164 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12165 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012166 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012167 }
12168
Benjamin Peterson29060642009-01-31 22:14:21 +000012169 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012170 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012171
Victor Stinner7931d9a2011-11-04 00:22:48 +010012172 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012173}
12174
12175PyObject*
12176PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12177{
12178 unsigned char *data;
12179 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012180 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012181
Victor Stinnerde636f32011-10-01 03:55:54 +020012182 if (PyUnicode_READY(self) == -1)
12183 return NULL;
12184
Victor Stinner684d5fd2012-05-03 02:32:34 +020012185 length = PyUnicode_GET_LENGTH(self);
12186 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012187
Victor Stinner684d5fd2012-05-03 02:32:34 +020012188 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012189 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012190
Victor Stinnerde636f32011-10-01 03:55:54 +020012191 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012192 PyErr_SetString(PyExc_IndexError, "string index out of range");
12193 return NULL;
12194 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012195 if (start >= length || end < start)
12196 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012197
Victor Stinner684d5fd2012-05-03 02:32:34 +020012198 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012199 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012200 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012201 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012202 }
12203 else {
12204 kind = PyUnicode_KIND(self);
12205 data = PyUnicode_1BYTE_DATA(self);
12206 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012207 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012208 length);
12209 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012210}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012211
12212static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012213do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012214{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012215 Py_ssize_t len, i, j;
12216
12217 if (PyUnicode_READY(self) == -1)
12218 return NULL;
12219
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012220 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012221
Victor Stinnercc7af722013-04-09 22:39:24 +020012222 if (PyUnicode_IS_ASCII(self)) {
12223 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12224
12225 i = 0;
12226 if (striptype != RIGHTSTRIP) {
12227 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012228 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012229 if (!_Py_ascii_whitespace[ch])
12230 break;
12231 i++;
12232 }
12233 }
12234
12235 j = len;
12236 if (striptype != LEFTSTRIP) {
12237 j--;
12238 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012239 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012240 if (!_Py_ascii_whitespace[ch])
12241 break;
12242 j--;
12243 }
12244 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012245 }
12246 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012247 else {
12248 int kind = PyUnicode_KIND(self);
12249 void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012250
Victor Stinnercc7af722013-04-09 22:39:24 +020012251 i = 0;
12252 if (striptype != RIGHTSTRIP) {
12253 while (i < len) {
12254 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12255 if (!Py_UNICODE_ISSPACE(ch))
12256 break;
12257 i++;
12258 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012259 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012260
12261 j = len;
12262 if (striptype != LEFTSTRIP) {
12263 j--;
12264 while (j >= i) {
12265 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12266 if (!Py_UNICODE_ISSPACE(ch))
12267 break;
12268 j--;
12269 }
12270 j++;
12271 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012272 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012273
Victor Stinner7931d9a2011-11-04 00:22:48 +010012274 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012275}
12276
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012277
12278static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012279do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012280{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012281 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012282
Serhiy Storchakac6792272013-10-19 21:03:34 +030012283 if (!PyArg_ParseTuple(args, stripformat[striptype], &sep))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012284 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012285
Benjamin Peterson14339b62009-01-31 16:36:08 +000012286 if (sep != NULL && sep != Py_None) {
12287 if (PyUnicode_Check(sep))
12288 return _PyUnicode_XStrip(self, striptype, sep);
12289 else {
12290 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012291 "%s arg must be None or str",
12292 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012293 return NULL;
12294 }
12295 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012296
Benjamin Peterson14339b62009-01-31 16:36:08 +000012297 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012298}
12299
12300
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012301PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012302 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012303\n\
12304Return a copy of the string S with leading and trailing\n\
12305whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012306If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012307
12308static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012309unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012310{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012311 if (PyTuple_GET_SIZE(args) == 0)
12312 return do_strip(self, BOTHSTRIP); /* Common case */
12313 else
12314 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012315}
12316
12317
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012318PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012319 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012320\n\
12321Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012322If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012323
12324static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012325unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012326{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012327 if (PyTuple_GET_SIZE(args) == 0)
12328 return do_strip(self, LEFTSTRIP); /* Common case */
12329 else
12330 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012331}
12332
12333
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012334PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012335 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012336\n\
12337Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012338If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012339
12340static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012341unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012342{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012343 if (PyTuple_GET_SIZE(args) == 0)
12344 return do_strip(self, RIGHTSTRIP); /* Common case */
12345 else
12346 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012347}
12348
12349
Guido van Rossumd57fd912000-03-10 22:53:23 +000012350static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012351unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012352{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012353 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012354 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012355
Serhiy Storchaka05997252013-01-26 12:14:02 +020012356 if (len < 1)
12357 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012358
Victor Stinnerc4b49542011-12-11 22:44:26 +010012359 /* no repeat, return original string */
12360 if (len == 1)
12361 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012362
Benjamin Petersonbac79492012-01-14 13:34:47 -050012363 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012364 return NULL;
12365
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012366 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012367 PyErr_SetString(PyExc_OverflowError,
12368 "repeated string is too long");
12369 return NULL;
12370 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012371 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012372
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012373 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012374 if (!u)
12375 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012376 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012377
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012378 if (PyUnicode_GET_LENGTH(str) == 1) {
12379 const int kind = PyUnicode_KIND(str);
12380 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012381 if (kind == PyUnicode_1BYTE_KIND) {
12382 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012383 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012384 }
12385 else if (kind == PyUnicode_2BYTE_KIND) {
12386 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012387 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012388 ucs2[n] = fill_char;
12389 } else {
12390 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12391 assert(kind == PyUnicode_4BYTE_KIND);
12392 for (n = 0; n < len; ++n)
12393 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012394 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012395 }
12396 else {
12397 /* number of characters copied this far */
12398 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012399 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012400 char *to = (char *) PyUnicode_DATA(u);
Christian Heimesf051e432016-09-13 20:22:02 +020012401 memcpy(to, PyUnicode_DATA(str),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012402 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012403 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012404 n = (done <= nchars-done) ? done : nchars-done;
Christian Heimesf051e432016-09-13 20:22:02 +020012405 memcpy(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012406 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012407 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012408 }
12409
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012410 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012411 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012412}
12413
Alexander Belopolsky40018472011-02-26 01:02:56 +000012414PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012415PyUnicode_Replace(PyObject *str,
12416 PyObject *substr,
12417 PyObject *replstr,
Alexander Belopolsky40018472011-02-26 01:02:56 +000012418 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012419{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012420 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12421 ensure_unicode(replstr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012422 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012423 return replace(str, substr, replstr, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012424}
12425
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012426PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000012427 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012428\n\
12429Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000012430old replaced by new. If the optional argument count is\n\
12431given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012432
12433static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012434unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012435{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012436 PyObject *str1;
12437 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012438 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012439
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012440 if (!PyArg_ParseTuple(args, "UU|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012441 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060012442 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012443 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012444 return replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012445}
12446
Alexander Belopolsky40018472011-02-26 01:02:56 +000012447static PyObject *
12448unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012449{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012450 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012451 Py_ssize_t isize;
12452 Py_ssize_t osize, squote, dquote, i, o;
12453 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012454 int ikind, okind, unchanged;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012455 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012456
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012457 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012458 return NULL;
12459
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012460 isize = PyUnicode_GET_LENGTH(unicode);
12461 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012462
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012463 /* Compute length of output, quote characters, and
12464 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012465 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012466 max = 127;
12467 squote = dquote = 0;
12468 ikind = PyUnicode_KIND(unicode);
12469 for (i = 0; i < isize; i++) {
12470 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040012471 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012472 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040012473 case '\'': squote++; break;
12474 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012475 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040012476 incr = 2;
12477 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012478 default:
12479 /* Fast-path ASCII */
12480 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012481 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012482 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012483 ;
12484 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012485 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012486 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012487 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012488 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012489 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012490 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040012491 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012492 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040012493 if (osize > PY_SSIZE_T_MAX - incr) {
12494 PyErr_SetString(PyExc_OverflowError,
12495 "string is too long to generate repr");
12496 return NULL;
12497 }
12498 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012499 }
12500
12501 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012502 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012503 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012504 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012505 if (dquote)
12506 /* Both squote and dquote present. Use squote,
12507 and escape them */
12508 osize += squote;
12509 else
12510 quote = '"';
12511 }
Victor Stinner55c08782013-04-14 18:45:39 +020012512 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012513
12514 repr = PyUnicode_New(osize, max);
12515 if (repr == NULL)
12516 return NULL;
12517 okind = PyUnicode_KIND(repr);
12518 odata = PyUnicode_DATA(repr);
12519
12520 PyUnicode_WRITE(okind, odata, 0, quote);
12521 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012522 if (unchanged) {
12523 _PyUnicode_FastCopyCharacters(repr, 1,
12524 unicode, 0,
12525 isize);
12526 }
12527 else {
12528 for (i = 0, o = 1; i < isize; i++) {
12529 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012530
Victor Stinner55c08782013-04-14 18:45:39 +020012531 /* Escape quotes and backslashes */
12532 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012533 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012534 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012535 continue;
12536 }
12537
12538 /* Map special whitespace to '\t', \n', '\r' */
12539 if (ch == '\t') {
12540 PyUnicode_WRITE(okind, odata, o++, '\\');
12541 PyUnicode_WRITE(okind, odata, o++, 't');
12542 }
12543 else if (ch == '\n') {
12544 PyUnicode_WRITE(okind, odata, o++, '\\');
12545 PyUnicode_WRITE(okind, odata, o++, 'n');
12546 }
12547 else if (ch == '\r') {
12548 PyUnicode_WRITE(okind, odata, o++, '\\');
12549 PyUnicode_WRITE(okind, odata, o++, 'r');
12550 }
12551
12552 /* Map non-printable US ASCII to '\xhh' */
12553 else if (ch < ' ' || ch == 0x7F) {
12554 PyUnicode_WRITE(okind, odata, o++, '\\');
12555 PyUnicode_WRITE(okind, odata, o++, 'x');
12556 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12557 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12558 }
12559
12560 /* Copy ASCII characters as-is */
12561 else if (ch < 0x7F) {
12562 PyUnicode_WRITE(okind, odata, o++, ch);
12563 }
12564
12565 /* Non-ASCII characters */
12566 else {
12567 /* Map Unicode whitespace and control characters
12568 (categories Z* and C* except ASCII space)
12569 */
12570 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12571 PyUnicode_WRITE(okind, odata, o++, '\\');
12572 /* Map 8-bit characters to '\xhh' */
12573 if (ch <= 0xff) {
12574 PyUnicode_WRITE(okind, odata, o++, 'x');
12575 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12576 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12577 }
12578 /* Map 16-bit characters to '\uxxxx' */
12579 else if (ch <= 0xffff) {
12580 PyUnicode_WRITE(okind, odata, o++, 'u');
12581 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12582 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12583 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12584 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12585 }
12586 /* Map 21-bit characters to '\U00xxxxxx' */
12587 else {
12588 PyUnicode_WRITE(okind, odata, o++, 'U');
12589 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12590 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12591 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12592 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12593 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12594 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12595 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12596 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12597 }
12598 }
12599 /* Copy characters as-is */
12600 else {
12601 PyUnicode_WRITE(okind, odata, o++, ch);
12602 }
Georg Brandl559e5d72008-06-11 18:37:52 +000012603 }
12604 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012605 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012606 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012607 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012608 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012609}
12610
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012611PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012612 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012613\n\
12614Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012615such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012616arguments start and end are interpreted as in slice notation.\n\
12617\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012618Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012619
12620static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012621unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012622{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012623 /* initialize variables to prevent gcc warning */
12624 PyObject *substring = NULL;
12625 Py_ssize_t start = 0;
12626 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012627 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012628
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012629 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012630 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012631
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012632 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012633 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012634
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012635 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012636
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012637 if (result == -2)
12638 return NULL;
12639
Christian Heimes217cfd12007-12-02 14:31:20 +000012640 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012641}
12642
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012643PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012644 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012645\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012646Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012647
12648static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012649unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012650{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012651 /* initialize variables to prevent gcc warning */
12652 PyObject *substring = NULL;
12653 Py_ssize_t start = 0;
12654 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012655 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012656
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012657 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012658 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012659
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012660 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012661 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012662
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012663 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012664
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012665 if (result == -2)
12666 return NULL;
12667
Guido van Rossumd57fd912000-03-10 22:53:23 +000012668 if (result < 0) {
12669 PyErr_SetString(PyExc_ValueError, "substring not found");
12670 return NULL;
12671 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012672
Christian Heimes217cfd12007-12-02 14:31:20 +000012673 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012674}
12675
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012676PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012677 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012678\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012679Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012680done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012681
12682static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012683unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012684{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012685 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012686 Py_UCS4 fillchar = ' ';
12687
Victor Stinnere9a29352011-10-01 02:14:59 +020012688 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012689 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012690
Benjamin Petersonbac79492012-01-14 13:34:47 -050012691 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012692 return NULL;
12693
Victor Stinnerc4b49542011-12-11 22:44:26 +010012694 if (PyUnicode_GET_LENGTH(self) >= width)
12695 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012696
Victor Stinnerc4b49542011-12-11 22:44:26 +010012697 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012698}
12699
Alexander Belopolsky40018472011-02-26 01:02:56 +000012700PyObject *
12701PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012702{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012703 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012704 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012705
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012706 return split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012707}
12708
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012709PyDoc_STRVAR(split__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012710 "S.split(sep=None, maxsplit=-1) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012711\n\
12712Return a list of the words in S, using sep as the\n\
12713delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012714splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012715whitespace string is a separator and empty strings are\n\
12716removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012717
12718static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012719unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012720{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012721 static char *kwlist[] = {"sep", "maxsplit", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000012722 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012723 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012724
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012725 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12726 kwlist, &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012727 return NULL;
12728
12729 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012730 return split(self, NULL, maxcount);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012731
12732 if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012733 return split(self, substring, maxcount);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012734
12735 PyErr_Format(PyExc_TypeError,
12736 "must be str or None, not %.100s",
12737 Py_TYPE(substring)->tp_name);
12738 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012739}
12740
Thomas Wouters477c8d52006-05-27 19:21:47 +000012741PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012742PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012743{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012744 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012745 int kind1, kind2;
12746 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012747 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012748
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012749 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012750 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012751
Victor Stinner14f8f022011-10-05 20:58:25 +020012752 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012753 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012754 len1 = PyUnicode_GET_LENGTH(str_obj);
12755 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012756 if (kind1 < kind2 || len1 < len2) {
12757 _Py_INCREF_UNICODE_EMPTY();
12758 if (!unicode_empty)
12759 out = NULL;
12760 else {
12761 out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
12762 Py_DECREF(unicode_empty);
12763 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012764 return out;
12765 }
12766 buf1 = PyUnicode_DATA(str_obj);
12767 buf2 = PyUnicode_DATA(sep_obj);
12768 if (kind2 != kind1) {
12769 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12770 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012771 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012772 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012773
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012774 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012775 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012776 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12777 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12778 else
12779 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012780 break;
12781 case PyUnicode_2BYTE_KIND:
12782 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12783 break;
12784 case PyUnicode_4BYTE_KIND:
12785 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12786 break;
12787 default:
12788 assert(0);
12789 out = 0;
12790 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012791
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012792 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012793 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012794
12795 return out;
12796}
12797
12798
12799PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012800PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012801{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012802 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012803 int kind1, kind2;
12804 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012805 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012806
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012807 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012808 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012809
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012810 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012811 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012812 len1 = PyUnicode_GET_LENGTH(str_obj);
12813 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012814 if (kind1 < kind2 || len1 < len2) {
12815 _Py_INCREF_UNICODE_EMPTY();
12816 if (!unicode_empty)
12817 out = NULL;
12818 else {
12819 out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
12820 Py_DECREF(unicode_empty);
12821 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012822 return out;
12823 }
12824 buf1 = PyUnicode_DATA(str_obj);
12825 buf2 = PyUnicode_DATA(sep_obj);
12826 if (kind2 != kind1) {
12827 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12828 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012829 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012830 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012831
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012832 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012833 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012834 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12835 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12836 else
12837 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012838 break;
12839 case PyUnicode_2BYTE_KIND:
12840 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12841 break;
12842 case PyUnicode_4BYTE_KIND:
12843 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12844 break;
12845 default:
12846 assert(0);
12847 out = 0;
12848 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012849
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012850 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012851 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012852
12853 return out;
12854}
12855
12856PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012857 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012858\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012859Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012860the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012861found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012862
12863static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012864unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012865{
Victor Stinner9310abb2011-10-05 00:59:23 +020012866 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012867}
12868
12869PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012870 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012871\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012872Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012873the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012874separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012875
12876static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012877unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012878{
Victor Stinner9310abb2011-10-05 00:59:23 +020012879 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012880}
12881
Alexander Belopolsky40018472011-02-26 01:02:56 +000012882PyObject *
12883PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012884{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012885 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012886 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012887
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012888 return rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012889}
12890
12891PyDoc_STRVAR(rsplit__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012892 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012893\n\
12894Return a list of the words in S, using sep as the\n\
12895delimiter string, starting at the end of the string and\n\
12896working to the front. If maxsplit is given, at most maxsplit\n\
12897splits are done. If sep is not specified, any whitespace string\n\
12898is a separator.");
12899
12900static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012901unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012902{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012903 static char *kwlist[] = {"sep", "maxsplit", 0};
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012904 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012905 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012906
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012907 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
12908 kwlist, &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012909 return NULL;
12910
12911 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012912 return rsplit(self, NULL, maxcount);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012913
12914 if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012915 return rsplit(self, substring, maxcount);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012916
12917 PyErr_Format(PyExc_TypeError,
12918 "must be str or None, not %.100s",
12919 Py_TYPE(substring)->tp_name);
12920 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012921}
12922
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012923PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012924 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012925\n\
12926Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012927Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012928is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012929
12930static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012931unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012932{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012933 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012934 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012935
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012936 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12937 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012938 return NULL;
12939
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012940 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012941}
12942
12943static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012944PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012945{
Victor Stinnerc4b49542011-12-11 22:44:26 +010012946 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012947}
12948
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012949PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012950 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012951\n\
12952Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012953and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012954
12955static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012956unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012957{
Benjamin Petersoneea48462012-01-16 14:28:50 -050012958 if (PyUnicode_READY(self) == -1)
12959 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012960 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012961}
12962
Larry Hastings61272b72014-01-07 12:41:53 -080012963/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000012964
Larry Hastings31826802013-10-19 00:09:25 -070012965@staticmethod
12966str.maketrans as unicode_maketrans
12967
12968 x: object
12969
12970 y: unicode=NULL
12971
12972 z: unicode=NULL
12973
12974 /
12975
12976Return a translation table usable for str.translate().
12977
12978If there is only one argument, it must be a dictionary mapping Unicode
12979ordinals (integers) or characters to Unicode ordinals, strings or None.
12980Character keys will be then converted to ordinals.
12981If there are two arguments, they must be strings of equal length, and
12982in the resulting dictionary, each character in x will be mapped to the
12983character at the same position in y. If there is a third argument, it
12984must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080012985[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070012986
Larry Hastings31826802013-10-19 00:09:25 -070012987static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080012988unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030012989/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070012990{
Georg Brandlceee0772007-11-27 23:48:05 +000012991 PyObject *new = NULL, *key, *value;
12992 Py_ssize_t i = 0;
12993 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012994
Georg Brandlceee0772007-11-27 23:48:05 +000012995 new = PyDict_New();
12996 if (!new)
12997 return NULL;
12998 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012999 int x_kind, y_kind, z_kind;
13000 void *x_data, *y_data, *z_data;
13001
Georg Brandlceee0772007-11-27 23:48:05 +000013002 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000013003 if (!PyUnicode_Check(x)) {
13004 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13005 "be a string if there is a second argument");
13006 goto err;
13007 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013008 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013009 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13010 "arguments must have equal length");
13011 goto err;
13012 }
13013 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013014 x_kind = PyUnicode_KIND(x);
13015 y_kind = PyUnicode_KIND(y);
13016 x_data = PyUnicode_DATA(x);
13017 y_data = PyUnicode_DATA(y);
13018 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13019 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013020 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000013021 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060013022 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013023 if (!value) {
13024 Py_DECREF(key);
13025 goto err;
13026 }
Georg Brandlceee0772007-11-27 23:48:05 +000013027 res = PyDict_SetItem(new, key, value);
13028 Py_DECREF(key);
13029 Py_DECREF(value);
13030 if (res < 0)
13031 goto err;
13032 }
13033 /* create entries for deleting chars in z */
13034 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013035 z_kind = PyUnicode_KIND(z);
13036 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013037 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013038 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013039 if (!key)
13040 goto err;
13041 res = PyDict_SetItem(new, key, Py_None);
13042 Py_DECREF(key);
13043 if (res < 0)
13044 goto err;
13045 }
13046 }
13047 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013048 int kind;
13049 void *data;
13050
Georg Brandlceee0772007-11-27 23:48:05 +000013051 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013052 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013053 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13054 "to maketrans it must be a dict");
13055 goto err;
13056 }
13057 /* copy entries into the new dict, converting string keys to int keys */
13058 while (PyDict_Next(x, &i, &key, &value)) {
13059 if (PyUnicode_Check(key)) {
13060 /* convert string keys to integer keys */
13061 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013062 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013063 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13064 "table must be of length 1");
13065 goto err;
13066 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013067 kind = PyUnicode_KIND(key);
13068 data = PyUnicode_DATA(key);
13069 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013070 if (!newkey)
13071 goto err;
13072 res = PyDict_SetItem(new, newkey, value);
13073 Py_DECREF(newkey);
13074 if (res < 0)
13075 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013076 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013077 /* just keep integer keys */
13078 if (PyDict_SetItem(new, key, value) < 0)
13079 goto err;
13080 } else {
13081 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13082 "be strings or integers");
13083 goto err;
13084 }
13085 }
13086 }
13087 return new;
13088 err:
13089 Py_DECREF(new);
13090 return NULL;
13091}
13092
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013093PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013094 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013095\n\
Zachary Ware79b98df2015-08-05 23:54:15 -050013096Return a copy of the string S in which each character has been mapped\n\
13097through the given translation table. The table must implement\n\
13098lookup/indexing via __getitem__, for instance a dictionary or list,\n\
13099mapping Unicode ordinals to Unicode ordinals, strings, or None. If\n\
13100this operation raises LookupError, the character is left untouched.\n\
13101Characters mapped to None are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013102
13103static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013104unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013105{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013106 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013107}
13108
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013109PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013110 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013111\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013112Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013113
13114static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020013115unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013116{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013117 if (PyUnicode_READY(self) == -1)
13118 return NULL;
13119 if (PyUnicode_IS_ASCII(self))
13120 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013121 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013122}
13123
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013124PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013125 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013126\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000013127Pad a numeric string S with zeros on the left, to fill a field\n\
13128of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013129
13130static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020013131unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013132{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013133 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013134 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013135 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013136 int kind;
13137 void *data;
13138 Py_UCS4 chr;
13139
Martin v. Löwis18e16552006-02-15 17:27:45 +000013140 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000013141 return NULL;
13142
Benjamin Petersonbac79492012-01-14 13:34:47 -050013143 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013144 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013145
Victor Stinnerc4b49542011-12-11 22:44:26 +010013146 if (PyUnicode_GET_LENGTH(self) >= width)
13147 return unicode_result_unchanged(self);
13148
13149 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013150
13151 u = pad(self, fill, 0, '0');
13152
Walter Dörwald068325e2002-04-15 13:36:47 +000013153 if (u == NULL)
13154 return NULL;
13155
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013156 kind = PyUnicode_KIND(u);
13157 data = PyUnicode_DATA(u);
13158 chr = PyUnicode_READ(kind, data, fill);
13159
13160 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013161 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013162 PyUnicode_WRITE(kind, data, 0, chr);
13163 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013164 }
13165
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013166 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013167 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013168}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013169
#if 0
/* Compiled out: thin wrapper around
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
13177
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013178PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013179 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013180\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013181Return True if S starts with the specified prefix, False otherwise.\n\
13182With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013183With optional end, stop comparing S at that position.\n\
13184prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013185
13186static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013187unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013188 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013189{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013190 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013191 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013192 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013193 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013194 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013195
Jesus Ceaac451502011-04-20 17:09:23 +020013196 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013197 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013198 if (PyTuple_Check(subobj)) {
13199 Py_ssize_t i;
13200 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013201 substring = PyTuple_GET_ITEM(subobj, i);
13202 if (!PyUnicode_Check(substring)) {
13203 PyErr_Format(PyExc_TypeError,
13204 "tuple for startswith must only contain str, "
13205 "not %.100s",
13206 Py_TYPE(substring)->tp_name);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013207 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013208 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013209 result = tailmatch(self, substring, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013210 if (result == -1)
13211 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013212 if (result) {
13213 Py_RETURN_TRUE;
13214 }
13215 }
13216 /* nothing matched */
13217 Py_RETURN_FALSE;
13218 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013219 if (!PyUnicode_Check(subobj)) {
13220 PyErr_Format(PyExc_TypeError,
13221 "startswith first arg must be str or "
13222 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013223 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013224 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013225 result = tailmatch(self, subobj, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013226 if (result == -1)
13227 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013228 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013229}
13230
13231
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013232PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013233 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013234\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013235Return True if S ends with the specified suffix, False otherwise.\n\
13236With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013237With optional end, stop comparing S at that position.\n\
13238suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013239
13240static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013241unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013242 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013243{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013244 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013245 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013246 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013247 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013248 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013249
Jesus Ceaac451502011-04-20 17:09:23 +020013250 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013251 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013252 if (PyTuple_Check(subobj)) {
13253 Py_ssize_t i;
13254 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013255 substring = PyTuple_GET_ITEM(subobj, i);
13256 if (!PyUnicode_Check(substring)) {
13257 PyErr_Format(PyExc_TypeError,
13258 "tuple for endswith must only contain str, "
13259 "not %.100s",
13260 Py_TYPE(substring)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013261 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013262 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013263 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013264 if (result == -1)
13265 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013266 if (result) {
13267 Py_RETURN_TRUE;
13268 }
13269 }
13270 Py_RETURN_FALSE;
13271 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013272 if (!PyUnicode_Check(subobj)) {
13273 PyErr_Format(PyExc_TypeError,
13274 "endswith first arg must be str or "
13275 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013276 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013277 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013278 result = tailmatch(self, subobj, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013279 if (result == -1)
13280 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013281 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013282}
13283
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013284static inline void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013285_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013286{
Victor Stinnereb36fda2015-10-03 01:55:51 +020013287 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13288 writer->data = PyUnicode_DATA(writer->buffer);
13289
13290 if (!writer->readonly) {
13291 writer->kind = PyUnicode_KIND(writer->buffer);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013292 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinnereb36fda2015-10-03 01:55:51 +020013293 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013294 else {
Victor Stinnereb36fda2015-10-03 01:55:51 +020013295 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13296 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13297 writer->kind = PyUnicode_WCHAR_KIND;
13298 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13299
Victor Stinner8f674cc2013-04-17 23:02:17 +020013300 /* Copy-on-write mode: set buffer size to 0 so
13301 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13302 * next write. */
13303 writer->size = 0;
13304 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013305}
13306
Victor Stinnerd3f08822012-05-29 12:57:52 +020013307void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013308_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013309{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013310 memset(writer, 0, sizeof(*writer));
Victor Stinnereb36fda2015-10-03 01:55:51 +020013311
13312 /* ASCII is the bare minimum */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013313 writer->min_char = 127;
Victor Stinnereb36fda2015-10-03 01:55:51 +020013314
13315 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13316 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13317 writer->kind = PyUnicode_WCHAR_KIND;
13318 assert(writer->kind <= PyUnicode_1BYTE_KIND);
Victor Stinner202fdca2012-05-07 12:47:02 +020013319}
13320
Victor Stinnerd3f08822012-05-29 12:57:52 +020013321int
13322_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13323 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013324{
13325 Py_ssize_t newlen;
13326 PyObject *newbuffer;
13327
Victor Stinner2740e462016-09-06 16:58:36 -070013328 assert(maxchar <= MAX_UNICODE);
13329
Victor Stinnerca9381e2015-09-22 00:58:32 +020013330 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
Victor Stinner61744742015-09-22 01:01:17 +020013331 assert((maxchar > writer->maxchar && length >= 0)
13332 || length > 0);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013333
Victor Stinner202fdca2012-05-07 12:47:02 +020013334 if (length > PY_SSIZE_T_MAX - writer->pos) {
13335 PyErr_NoMemory();
13336 return -1;
13337 }
13338 newlen = writer->pos + length;
13339
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013340 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013341
Victor Stinnerd3f08822012-05-29 12:57:52 +020013342 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013343 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013344 if (writer->overallocate
13345 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13346 /* overallocate to limit the number of realloc() */
13347 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013348 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013349 if (newlen < writer->min_length)
13350 newlen = writer->min_length;
13351
Victor Stinnerd3f08822012-05-29 12:57:52 +020013352 writer->buffer = PyUnicode_New(newlen, maxchar);
13353 if (writer->buffer == NULL)
13354 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013355 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013356 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013357 if (writer->overallocate
13358 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13359 /* overallocate to limit the number of realloc() */
13360 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013361 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013362 if (newlen < writer->min_length)
13363 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013364
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013365 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013366 /* resize + widen */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +030013367 maxchar = Py_MAX(maxchar, writer->maxchar);
Victor Stinner202fdca2012-05-07 12:47:02 +020013368 newbuffer = PyUnicode_New(newlen, maxchar);
13369 if (newbuffer == NULL)
13370 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013371 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13372 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013373 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013374 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013375 }
13376 else {
13377 newbuffer = resize_compact(writer->buffer, newlen);
13378 if (newbuffer == NULL)
13379 return -1;
13380 }
13381 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013382 }
13383 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013384 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013385 newbuffer = PyUnicode_New(writer->size, maxchar);
13386 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013387 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013388 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13389 writer->buffer, 0, writer->pos);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030013390 Py_SETREF(writer->buffer, newbuffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013391 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013392 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013393 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010013394
13395#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020013396}
13397
Victor Stinnerca9381e2015-09-22 00:58:32 +020013398int
13399_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13400 enum PyUnicode_Kind kind)
13401{
13402 Py_UCS4 maxchar;
13403
13404 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13405 assert(writer->kind < kind);
13406
13407 switch (kind)
13408 {
13409 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13410 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13411 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
13412 default:
13413 assert(0 && "invalid kind");
13414 return -1;
13415 }
13416
13417 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13418}
13419
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013420static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013421_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013422{
Victor Stinner2740e462016-09-06 16:58:36 -070013423 assert(ch <= MAX_UNICODE);
Victor Stinnera0dd0212013-04-11 22:09:04 +020013424 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13425 return -1;
13426 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13427 writer->pos++;
13428 return 0;
13429}
13430
13431int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013432_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13433{
13434 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13435}
13436
13437int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013438_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13439{
13440 Py_UCS4 maxchar;
13441 Py_ssize_t len;
13442
13443 if (PyUnicode_READY(str) == -1)
13444 return -1;
13445 len = PyUnicode_GET_LENGTH(str);
13446 if (len == 0)
13447 return 0;
13448 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13449 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013450 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010013451 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020013452 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013453 Py_INCREF(str);
13454 writer->buffer = str;
13455 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013456 writer->pos += len;
13457 return 0;
13458 }
13459 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13460 return -1;
13461 }
13462 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13463 str, 0, len);
13464 writer->pos += len;
13465 return 0;
13466}
13467
Victor Stinnere215d962012-10-06 23:03:36 +020013468int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013469_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13470 Py_ssize_t start, Py_ssize_t end)
13471{
13472 Py_UCS4 maxchar;
13473 Py_ssize_t len;
13474
13475 if (PyUnicode_READY(str) == -1)
13476 return -1;
13477
13478 assert(0 <= start);
13479 assert(end <= PyUnicode_GET_LENGTH(str));
13480 assert(start <= end);
13481
13482 if (end == 0)
13483 return 0;
13484
13485 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13486 return _PyUnicodeWriter_WriteStr(writer, str);
13487
13488 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13489 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13490 else
13491 maxchar = writer->maxchar;
13492 len = end - start;
13493
13494 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13495 return -1;
13496
13497 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13498 str, start, len);
13499 writer->pos += len;
13500 return 0;
13501}
13502
13503int
Victor Stinner4a587072013-11-19 12:54:53 +010013504_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13505 const char *ascii, Py_ssize_t len)
13506{
13507 if (len == -1)
13508 len = strlen(ascii);
13509
13510 assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13511
13512 if (writer->buffer == NULL && !writer->overallocate) {
13513 PyObject *str;
13514
13515 str = _PyUnicode_FromASCII(ascii, len);
13516 if (str == NULL)
13517 return -1;
13518
13519 writer->readonly = 1;
13520 writer->buffer = str;
13521 _PyUnicodeWriter_Update(writer);
13522 writer->pos += len;
13523 return 0;
13524 }
13525
13526 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13527 return -1;
13528
13529 switch (writer->kind)
13530 {
13531 case PyUnicode_1BYTE_KIND:
13532 {
13533 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13534 Py_UCS1 *data = writer->data;
13535
Christian Heimesf051e432016-09-13 20:22:02 +020013536 memcpy(data + writer->pos, str, len);
Victor Stinner4a587072013-11-19 12:54:53 +010013537 break;
13538 }
13539 case PyUnicode_2BYTE_KIND:
13540 {
13541 _PyUnicode_CONVERT_BYTES(
13542 Py_UCS1, Py_UCS2,
13543 ascii, ascii + len,
13544 (Py_UCS2 *)writer->data + writer->pos);
13545 break;
13546 }
13547 case PyUnicode_4BYTE_KIND:
13548 {
13549 _PyUnicode_CONVERT_BYTES(
13550 Py_UCS1, Py_UCS4,
13551 ascii, ascii + len,
13552 (Py_UCS4 *)writer->data + writer->pos);
13553 break;
13554 }
13555 default:
13556 assert(0);
13557 }
13558
13559 writer->pos += len;
13560 return 0;
13561}
13562
13563int
13564_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13565 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020013566{
13567 Py_UCS4 maxchar;
13568
13569 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13570 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13571 return -1;
13572 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13573 writer->pos += len;
13574 return 0;
13575}
13576
Victor Stinnerd3f08822012-05-29 12:57:52 +020013577PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013578_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013579{
Victor Stinner15a0bd32013-07-08 22:29:55 +020013580 PyObject *str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013581 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013582 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013583 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013584 }
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013585 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013586 str = writer->buffer;
13587 writer->buffer = NULL;
13588 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13589 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013590 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013591 if (writer->pos == 0) {
13592 Py_CLEAR(writer->buffer);
13593
13594 /* Get the empty Unicode string singleton ('') */
13595 _Py_INCREF_UNICODE_EMPTY();
13596 str = unicode_empty;
Victor Stinner202fdca2012-05-07 12:47:02 +020013597 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013598 else {
13599 str = writer->buffer;
13600 writer->buffer = NULL;
13601
13602 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
13603 PyObject *str2;
13604 str2 = resize_compact(str, writer->pos);
13605 if (str2 == NULL)
13606 return NULL;
13607 str = str2;
13608 }
13609 }
13610
Victor Stinner15a0bd32013-07-08 22:29:55 +020013611 assert(_PyUnicode_CheckConsistency(str, 1));
13612 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020013613}
13614
Victor Stinnerd3f08822012-05-29 12:57:52 +020013615void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013616_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013617{
13618 Py_CLEAR(writer->buffer);
13619}
13620
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013621#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013622
13623PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013624 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013625\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013626Return a formatted version of S, using substitutions from args and kwargs.\n\
13627The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013628
Eric Smith27bbca62010-11-04 17:06:58 +000013629PyDoc_STRVAR(format_map__doc__,
13630 "S.format_map(mapping) -> str\n\
13631\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013632Return a formatted version of S, using substitutions from mapping.\n\
13633The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013634
Eric Smith4a7d76d2008-05-30 18:10:19 +000013635static PyObject *
13636unicode__format__(PyObject* self, PyObject* args)
13637{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013638 PyObject *format_spec;
13639 _PyUnicodeWriter writer;
13640 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013641
13642 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
13643 return NULL;
13644
Victor Stinnerd3f08822012-05-29 12:57:52 +020013645 if (PyUnicode_READY(self) == -1)
13646 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020013647 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013648 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13649 self, format_spec, 0,
13650 PyUnicode_GET_LENGTH(format_spec));
13651 if (ret == -1) {
13652 _PyUnicodeWriter_Dealloc(&writer);
13653 return NULL;
13654 }
13655 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013656}
13657
Eric Smith8c663262007-08-25 02:26:07 +000013658PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013659 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013660\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013661Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000013662
13663static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013664unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013665{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013666 Py_ssize_t size;
13667
13668 /* If it's a compact object, account for base structure +
13669 character data. */
13670 if (PyUnicode_IS_COMPACT_ASCII(v))
13671 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
13672 else if (PyUnicode_IS_COMPACT(v))
13673 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013674 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013675 else {
13676 /* If it is a two-block object, account for base object, and
13677 for character block if present. */
13678 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020013679 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013680 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013681 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013682 }
13683 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013684 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020013685 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013686 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020013687 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020013688 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013689
13690 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013691}
13692
/* Docstring of str.__sizeof__(). */
PyDoc_STRVAR(sizeof__doc__,
             "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013695
13696static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013697unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013698{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013699 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013700 if (!copy)
13701 return NULL;
13702 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013703}
13704
/* Method table of the str type.

   Each entry is {name, C implementation, calling-convention flags,
   docstring}; the table is terminated by the {NULL, NULL} sentinel. */
static PyMethodDef unicode_methods[] = {
    {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
    {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
    {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs,
     METH_VARARGS | METH_KEYWORDS, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
    {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines,
     METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
    {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
    {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
    /* format() and format_map() are implemented by do_string_format() and
       do_string_format_map(), which are not defined in this part of the
       file. */
    {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
    {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
    {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
    /* Macro that supplies a table entry; presumably the one for maketrans()
       -- NOTE(review): confirm against its definition. */
    UNICODE_MAKETRANS_METHODDEF
    {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
#if 0
    /* These methods are just used for debugging the implementation. */
    {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
#endif

    /* No docstring is given for __getnewargs__: the fourth member is left
       zero-initialized. */
    {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
    {NULL, NULL}        /* sentinel */
};
13762
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013763static PyObject *
13764unicode_mod(PyObject *v, PyObject *w)
13765{
Brian Curtindfc80e32011-08-10 20:28:54 -050013766 if (!PyUnicode_Check(v))
13767 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013768 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013769}
13770
/* Number protocol of str: only the remainder slot is filled in (with
   unicode_mod, which performs "%" formatting).  The slots after nb_remainder
   are not listed and are therefore zero-initialized. */
static PyNumberMethods unicode_as_number = {
    0,              /*nb_add*/
    0,              /*nb_subtract*/
    0,              /*nb_multiply*/
    unicode_mod,            /*nb_remainder*/
};
13777
/* Sequence protocol of str.  The slice and item-assignment slots are left
   empty (0). */
static PySequenceMethods unicode_as_sequence = {
    (lenfunc) unicode_length,       /* sq_length */
    PyUnicode_Concat,           /* sq_concat */
    (ssizeargfunc) unicode_repeat,  /* sq_repeat */
    (ssizeargfunc) unicode_getitem,     /* sq_item */
    0,                  /* sq_slice */
    0,                  /* sq_ass_item */
    0,                  /* sq_ass_slice */
    PyUnicode_Contains,         /* sq_contains */
};
13788
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013789static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013790unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013791{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013792 if (PyUnicode_READY(self) == -1)
13793 return NULL;
13794
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013795 if (PyIndex_Check(item)) {
13796 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013797 if (i == -1 && PyErr_Occurred())
13798 return NULL;
13799 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013800 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013801 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013802 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013803 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013804 PyObject *result;
13805 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013806 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013807 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013808
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013809 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013810 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013811 return NULL;
13812 }
13813
13814 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020013815 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013816 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013817 slicelength == PyUnicode_GET_LENGTH(self)) {
13818 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013819 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013820 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013821 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013822 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013823 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013824 src_kind = PyUnicode_KIND(self);
13825 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013826 if (!PyUnicode_IS_ASCII(self)) {
13827 kind_limit = kind_maxchar_limit(src_kind);
13828 max_char = 0;
13829 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13830 ch = PyUnicode_READ(src_kind, src_data, cur);
13831 if (ch > max_char) {
13832 max_char = ch;
13833 if (max_char >= kind_limit)
13834 break;
13835 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013836 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013837 }
Victor Stinner55c99112011-10-13 01:17:06 +020013838 else
13839 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013840 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013841 if (result == NULL)
13842 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013843 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013844 dest_data = PyUnicode_DATA(result);
13845
13846 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013847 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13848 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013849 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013850 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013851 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013852 } else {
13853 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13854 return NULL;
13855 }
13856}
13857
/* Mapping protocol of str: length and subscription (index or slice) are
   provided; subscript assignment is left empty. */
static PyMappingMethods unicode_as_mapping = {
    (lenfunc)unicode_length,        /* mp_length */
    (binaryfunc)unicode_subscript,  /* mp_subscript */
    (objobjargproc)0,           /* mp_ass_subscript */
};
13863
Guido van Rossumd57fd912000-03-10 22:53:23 +000013864
Guido van Rossumd57fd912000-03-10 22:53:23 +000013865/* Helpers for PyUnicode_Format() */
13866
/* State shared by the helpers of PyUnicode_Format() while one format string
   is being processed. */
struct unicode_formatter_t {
    /* Arguments still to be consumed: a tuple indexed by argidx, or a single
       object when arglen is negative. */
    PyObject *args;
    /* Non-zero when this structure holds its own reference to args (set
       after args has been fetched from dict for a "%(name)" key). */
    int args_owned;
    /* arglen: number of arguments, negative when args is returned as a
       whole instead of being indexed.  argidx: index of the next argument. */
    Py_ssize_t arglen, argidx;
    /* Mapping used to resolve "%(name)" keys, or NULL if none was given. */
    PyObject *dict;

    /* Format string being read: fmtdata/fmtkind are read at index fmtpos;
       fmtcnt is decremented as characters are consumed.  fmtstr is the
       string object that keys are sliced from. */
    enum PyUnicode_Kind fmtkind;
    Py_ssize_t fmtcnt, fmtpos;
    void *fmtdata;
    PyObject *fmtstr;

    /* Output accumulator. */
    _PyUnicodeWriter writer;
};
13880
/* Parsed description of a single "%..." conversion. */
struct unicode_format_arg_t {
    /* Character most recently read from the format string; once parsing is
       done, the conversion character (e.g. 'd', 'x', 'f'). */
    Py_UCS4 ch;
    /* Bitmask of F_LJUST, F_SIGN, F_BLANK, F_ALT and F_ZERO. */
    int flags;
    /* Field width; -1 is treated as "not specified" by the fast paths. */
    Py_ssize_t width;
    /* Precision; a negative value means "not specified". */
    int prec;
    /* NOTE(review): not used in this part of the file; presumably tells the
       output stage whether a sign has to be handled -- confirm. */
    int sign;
};
13888
Guido van Rossumd57fd912000-03-10 22:53:23 +000013889static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020013890unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013891{
Victor Stinnera47082312012-10-04 02:19:54 +020013892 Py_ssize_t argidx = ctx->argidx;
13893
13894 if (argidx < ctx->arglen) {
13895 ctx->argidx++;
13896 if (ctx->arglen < 0)
13897 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000013898 else
Victor Stinnera47082312012-10-04 02:19:54 +020013899 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013900 }
13901 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013902 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013903 return NULL;
13904}
13905
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013906/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013907
Victor Stinnera47082312012-10-04 02:19:54 +020013908/* Format a float into the writer if the writer is not NULL, or into *p_output
13909 otherwise.
13910
13911 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020013912static int
Victor Stinnera47082312012-10-04 02:19:54 +020013913formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
13914 PyObject **p_output,
13915 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013916{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013917 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013918 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013919 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020013920 int prec;
13921 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000013922
Guido van Rossumd57fd912000-03-10 22:53:23 +000013923 x = PyFloat_AsDouble(v);
13924 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020013925 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013926
Victor Stinnera47082312012-10-04 02:19:54 +020013927 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013928 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013929 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013930
Victor Stinnera47082312012-10-04 02:19:54 +020013931 if (arg->flags & F_ALT)
13932 dtoa_flags = Py_DTSF_ALT;
13933 else
13934 dtoa_flags = 0;
13935 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013936 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020013937 return -1;
13938 len = strlen(p);
13939 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010013940 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020013941 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013942 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020013943 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020013944 }
13945 else
13946 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000013947 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013948 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013949}
13950
/* _PyUnicode_FormatLong() emulates the format codes d, i, u, o, x and X,
 * and the alternate form ('#'), for Python ints.
 * Return value:  a new PyUnicodeObject*, or NULL if error.
 *     The output string is of the form
 *         "-"? ("0o" | "0x" | "0X")? digit+
 *     The base marker is present only for o, x and X conversions, and only
 *         when alt is non-zero.  The case of hex digits will be correct,
 *     There will be at least prec digits, zero-filled on the left if
 *         necessary to get that many.
 * val          object to be converted; must be an int (asserted)
 * alt          non-zero to keep the base marker (alternate form)
 * prec         minimum number of digits; 0-fill on left if needed
 * type         a character in [diuoxX]; i and u act the same as d
 *
 * CAUTION:  unlike C's printf, the o, x and X conversions can produce a '-'
 * sign here.
 */
PyObject *
_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
{
    PyObject *result = NULL;
    char *buf;
    Py_ssize_t i;
    int sign;           /* 1 if '-', else 0 */
    int len;            /* number of characters */
    Py_ssize_t llen;
    int numdigits;      /* len == numnondigits + numdigits */
    int numnondigits = 0;

    /* numnondigits + prec is computed as an int below and numnondigits is
       at most 3 (sign plus a 2-character base marker): keep the sum within
       INT_MAX. */
    if (prec > INT_MAX-3) {
        PyErr_SetString(PyExc_OverflowError,
                        "precision too large");
        return NULL;
    }

    assert(PyLong_Check(val));

    switch (type) {
    default:
        assert(!"'type' not in [diuoxX]");
        /* fall through: unknown types are formatted as decimal when
           assertions are disabled */
    case 'd':
    case 'i':
    case 'u':
        /* int and int subclasses should print numerically when a numeric */
        /* format code is used (see issue18780) */
        result = PyNumber_ToBase(val, 10);
        break;
    case 'o':
        /* 2 accounts for the "0o" marker written by PyNumber_ToBase() */
        numnondigits = 2;
        result = PyNumber_ToBase(val, 8);
        break;
    case 'x':
    case 'X':
        /* 2 accounts for the "0x" marker written by PyNumber_ToBase() */
        numnondigits = 2;
        result = PyNumber_ToBase(val, 16);
        break;
    }
    if (!result)
        return NULL;

    assert(unicode_modifiable(result));
    assert(PyUnicode_IS_READY(result));
    assert(PyUnicode_IS_ASCII(result));

    /* To modify the string in-place, there can only be one reference. */
    if (Py_REFCNT(result) != 1) {
        Py_DECREF(result);
        PyErr_BadInternalCall();
        return NULL;
    }
    buf = PyUnicode_DATA(result);
    llen = PyUnicode_GET_LENGTH(result);
    /* len and the counters derived from it are plain ints */
    if (llen > INT_MAX) {
        Py_DECREF(result);
        PyErr_SetString(PyExc_ValueError,
                        "string too large in _PyUnicode_FormatLong");
        return NULL;
    }
    len = (int)llen;
    sign = buf[0] == '-';
    numnondigits += sign;
    numdigits = len - numnondigits;
    assert(numdigits > 0);

    /* Get rid of base marker unless alt is set.  buf is advanced past the
       first two characters and, for negative numbers, the '-' is rewritten
       in front of the digits (overwriting the second marker character). */
    if (((alt) == 0 &&
        (type == 'o' || type == 'x' || type == 'X'))) {
        assert(buf[sign] == '0');
        assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
               buf[sign+1] == 'o');
        numnondigits -= 2;
        buf += 2;
        len -= 2;
        if (sign)
            buf[0] = '-';
        assert(len == numnondigits + numdigits);
        assert(numdigits > 0);
    }

    /* Fill with leading zeroes to reach the minimum number of digits
       (prec).  The padded text is assembled in a temporary bytes object
       which then replaces result. */
    if (prec > numdigits) {
        PyObject *r1 = PyBytes_FromStringAndSize(NULL,
                                numnondigits + prec);
        char *b1;
        if (!r1) {
            Py_DECREF(result);
            return NULL;
        }
        b1 = PyBytes_AS_STRING(r1);
        /* sign and base marker first, then the padding, then the digits */
        for (i = 0; i < numnondigits; ++i)
            *b1++ = *buf++;
        for (i = 0; i < prec - numdigits; i++)
            *b1++ = '0';
        for (i = 0; i < numdigits; i++)
            *b1++ = *buf++;
        *b1 = '\0';
        Py_DECREF(result);
        result = r1;
        buf = PyBytes_AS_STRING(result);
        len = numnondigits + prec;
    }

    /* Fix up case for hex conversions. */
    if (type == 'X') {
        /* Need to convert all lower case letters to upper case.
           and need to convert 0x to 0X (and -0x to -0X). */
        for (i = 0; i < len; i++)
            if (buf[i] >= 'a' && buf[i] <= 'x')
                buf[i] -= 'a'-'A';
    }
    /* Build the final str.  A new object is needed when result is the
       temporary bytes object, or when buf no longer points at the start of
       result's data because the base marker was skipped. */
    if (!PyUnicode_Check(result)
        || buf != PyUnicode_DATA(result)) {
        PyObject *unicode;
        unicode = _PyUnicode_FromASCII(buf, len);
        Py_DECREF(result);
        result = unicode;
    }
    else if (len != PyUnicode_GET_LENGTH(result)) {
        if (PyUnicode_Resize(&result, len) < 0)
            Py_CLEAR(result);
    }
    return result;
}
14096
Ethan Furmandf3ed242014-01-05 06:50:30 -080014097/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014098 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014099 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014100 * -1 and raise an exception on error */
14101static int
Victor Stinnera47082312012-10-04 02:19:54 +020014102mainformatlong(PyObject *v,
14103 struct unicode_format_arg_t *arg,
14104 PyObject **p_output,
14105 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014106{
14107 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014108 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014109
14110 if (!PyNumber_Check(v))
14111 goto wrongtype;
14112
Ethan Furman9ab74802014-03-21 06:38:46 -070014113 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014114 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014115 if (type == 'o' || type == 'x' || type == 'X') {
14116 iobj = PyNumber_Index(v);
14117 if (iobj == NULL) {
Ethan Furman9ab74802014-03-21 06:38:46 -070014118 if (PyErr_ExceptionMatches(PyExc_TypeError))
14119 goto wrongtype;
Ethan Furman38d872e2014-03-19 08:38:52 -070014120 return -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014121 }
14122 }
14123 else {
14124 iobj = PyNumber_Long(v);
14125 if (iobj == NULL ) {
14126 if (PyErr_ExceptionMatches(PyExc_TypeError))
14127 goto wrongtype;
14128 return -1;
14129 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014130 }
14131 assert(PyLong_Check(iobj));
14132 }
14133 else {
14134 iobj = v;
14135 Py_INCREF(iobj);
14136 }
14137
14138 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014139 && arg->width == -1 && arg->prec == -1
14140 && !(arg->flags & (F_SIGN | F_BLANK))
14141 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014142 {
14143 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014144 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014145 int base;
14146
Victor Stinnera47082312012-10-04 02:19:54 +020014147 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014148 {
14149 default:
14150 assert(0 && "'type' not in [diuoxX]");
14151 case 'd':
14152 case 'i':
14153 case 'u':
14154 base = 10;
14155 break;
14156 case 'o':
14157 base = 8;
14158 break;
14159 case 'x':
14160 case 'X':
14161 base = 16;
14162 break;
14163 }
14164
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014165 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14166 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014167 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014168 }
14169 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014170 return 1;
14171 }
14172
Ethan Furmanb95b5612015-01-23 20:05:18 -080014173 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014174 Py_DECREF(iobj);
14175 if (res == NULL)
14176 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014177 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014178 return 0;
14179
14180wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014181 switch(type)
14182 {
14183 case 'o':
14184 case 'x':
14185 case 'X':
14186 PyErr_Format(PyExc_TypeError,
14187 "%%%c format: an integer is required, "
14188 "not %.200s",
14189 type, Py_TYPE(v)->tp_name);
14190 break;
14191 default:
14192 PyErr_Format(PyExc_TypeError,
14193 "%%%c format: a number is required, "
14194 "not %.200s",
14195 type, Py_TYPE(v)->tp_name);
14196 break;
14197 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014198 return -1;
14199}
14200
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014201static Py_UCS4
14202formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014203{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014204 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014205 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014206 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014207 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014208 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014209 goto onError;
14210 }
14211 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014212 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014213 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014214 /* make sure number is a type of integer */
14215 if (!PyLong_Check(v)) {
14216 iobj = PyNumber_Index(v);
14217 if (iobj == NULL) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014218 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014219 }
14220 v = iobj;
14221 Py_DECREF(iobj);
14222 }
14223 /* Integer input truncated to a character */
Benjamin Peterson29060642009-01-31 22:14:21 +000014224 x = PyLong_AsLong(v);
14225 if (x == -1 && PyErr_Occurred())
14226 goto onError;
14227
Victor Stinner8faf8212011-12-08 22:14:11 +010014228 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014229 PyErr_SetString(PyExc_OverflowError,
14230 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014231 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014232 }
14233
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014234 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014235 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014236
Benjamin Peterson29060642009-01-31 22:14:21 +000014237 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014238 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014239 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014240 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014241}
14242
Victor Stinnera47082312012-10-04 02:19:54 +020014243/* Parse options of an argument: flags, width, precision.
14244 Handle also "%(name)" syntax.
14245
14246 Return 0 if the argument has been formatted into arg->str.
14247 Return 1 if the argument has been written into ctx->writer,
14248 Raise an exception and return -1 on error. */
14249static int
14250unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14251 struct unicode_format_arg_t *arg)
14252{
14253#define FORMAT_READ(ctx) \
14254 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14255
14256 PyObject *v;
14257
Victor Stinnera47082312012-10-04 02:19:54 +020014258 if (arg->ch == '(') {
14259 /* Get argument value from a dictionary. Example: "%(name)s". */
14260 Py_ssize_t keystart;
14261 Py_ssize_t keylen;
14262 PyObject *key;
14263 int pcount = 1;
14264
14265 if (ctx->dict == NULL) {
14266 PyErr_SetString(PyExc_TypeError,
14267 "format requires a mapping");
14268 return -1;
14269 }
14270 ++ctx->fmtpos;
14271 --ctx->fmtcnt;
14272 keystart = ctx->fmtpos;
14273 /* Skip over balanced parentheses */
14274 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14275 arg->ch = FORMAT_READ(ctx);
14276 if (arg->ch == ')')
14277 --pcount;
14278 else if (arg->ch == '(')
14279 ++pcount;
14280 ctx->fmtpos++;
14281 }
14282 keylen = ctx->fmtpos - keystart - 1;
14283 if (ctx->fmtcnt < 0 || pcount > 0) {
14284 PyErr_SetString(PyExc_ValueError,
14285 "incomplete format key");
14286 return -1;
14287 }
14288 key = PyUnicode_Substring(ctx->fmtstr,
14289 keystart, keystart + keylen);
14290 if (key == NULL)
14291 return -1;
14292 if (ctx->args_owned) {
Victor Stinnera47082312012-10-04 02:19:54 +020014293 ctx->args_owned = 0;
Serhiy Storchaka191321d2015-12-27 15:41:34 +020014294 Py_DECREF(ctx->args);
Victor Stinnera47082312012-10-04 02:19:54 +020014295 }
14296 ctx->args = PyObject_GetItem(ctx->dict, key);
14297 Py_DECREF(key);
14298 if (ctx->args == NULL)
14299 return -1;
14300 ctx->args_owned = 1;
14301 ctx->arglen = -1;
14302 ctx->argidx = -2;
14303 }
14304
14305 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014306 while (--ctx->fmtcnt >= 0) {
14307 arg->ch = FORMAT_READ(ctx);
14308 ctx->fmtpos++;
14309 switch (arg->ch) {
14310 case '-': arg->flags |= F_LJUST; continue;
14311 case '+': arg->flags |= F_SIGN; continue;
14312 case ' ': arg->flags |= F_BLANK; continue;
14313 case '#': arg->flags |= F_ALT; continue;
14314 case '0': arg->flags |= F_ZERO; continue;
14315 }
14316 break;
14317 }
14318
14319 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014320 if (arg->ch == '*') {
14321 v = unicode_format_getnextarg(ctx);
14322 if (v == NULL)
14323 return -1;
14324 if (!PyLong_Check(v)) {
14325 PyErr_SetString(PyExc_TypeError,
14326 "* wants int");
14327 return -1;
14328 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014329 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014330 if (arg->width == -1 && PyErr_Occurred())
14331 return -1;
14332 if (arg->width < 0) {
14333 arg->flags |= F_LJUST;
14334 arg->width = -arg->width;
14335 }
14336 if (--ctx->fmtcnt >= 0) {
14337 arg->ch = FORMAT_READ(ctx);
14338 ctx->fmtpos++;
14339 }
14340 }
14341 else if (arg->ch >= '0' && arg->ch <= '9') {
14342 arg->width = arg->ch - '0';
14343 while (--ctx->fmtcnt >= 0) {
14344 arg->ch = FORMAT_READ(ctx);
14345 ctx->fmtpos++;
14346 if (arg->ch < '0' || arg->ch > '9')
14347 break;
14348 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14349 mixing signed and unsigned comparison. Since arg->ch is between
14350 '0' and '9', casting to int is safe. */
14351 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14352 PyErr_SetString(PyExc_ValueError,
14353 "width too big");
14354 return -1;
14355 }
14356 arg->width = arg->width*10 + (arg->ch - '0');
14357 }
14358 }
14359
14360 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014361 if (arg->ch == '.') {
14362 arg->prec = 0;
14363 if (--ctx->fmtcnt >= 0) {
14364 arg->ch = FORMAT_READ(ctx);
14365 ctx->fmtpos++;
14366 }
14367 if (arg->ch == '*') {
14368 v = unicode_format_getnextarg(ctx);
14369 if (v == NULL)
14370 return -1;
14371 if (!PyLong_Check(v)) {
14372 PyErr_SetString(PyExc_TypeError,
14373 "* wants int");
14374 return -1;
14375 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014376 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014377 if (arg->prec == -1 && PyErr_Occurred())
14378 return -1;
14379 if (arg->prec < 0)
14380 arg->prec = 0;
14381 if (--ctx->fmtcnt >= 0) {
14382 arg->ch = FORMAT_READ(ctx);
14383 ctx->fmtpos++;
14384 }
14385 }
14386 else if (arg->ch >= '0' && arg->ch <= '9') {
14387 arg->prec = arg->ch - '0';
14388 while (--ctx->fmtcnt >= 0) {
14389 arg->ch = FORMAT_READ(ctx);
14390 ctx->fmtpos++;
14391 if (arg->ch < '0' || arg->ch > '9')
14392 break;
14393 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14394 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014395 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014396 return -1;
14397 }
14398 arg->prec = arg->prec*10 + (arg->ch - '0');
14399 }
14400 }
14401 }
14402
14403 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14404 if (ctx->fmtcnt >= 0) {
14405 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14406 if (--ctx->fmtcnt >= 0) {
14407 arg->ch = FORMAT_READ(ctx);
14408 ctx->fmtpos++;
14409 }
14410 }
14411 }
14412 if (ctx->fmtcnt < 0) {
14413 PyErr_SetString(PyExc_ValueError,
14414 "incomplete format");
14415 return -1;
14416 }
14417 return 0;
14418
14419#undef FORMAT_READ
14420}
14421
14422/* Format one argument. Supported conversion specifiers:
14423
14424 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014425 - "i", "d", "u": int or float
14426 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014427 - "e", "E", "f", "F", "g", "G": float
14428 - "c": int or str (1 character)
14429
Victor Stinner8dbd4212012-12-04 09:30:24 +010014430 When possible, the output is written directly into the Unicode writer
14431 (ctx->writer). A string is created when padding is required.
14432
Victor Stinnera47082312012-10-04 02:19:54 +020014433 Return 0 if the argument has been formatted into *p_str,
14434 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014435 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014436static int
14437unicode_format_arg_format(struct unicode_formatter_t *ctx,
14438 struct unicode_format_arg_t *arg,
14439 PyObject **p_str)
14440{
14441 PyObject *v;
14442 _PyUnicodeWriter *writer = &ctx->writer;
14443
14444 if (ctx->fmtcnt == 0)
14445 ctx->writer.overallocate = 0;
14446
14447 if (arg->ch == '%') {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014448 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014449 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014450 return 1;
14451 }
14452
14453 v = unicode_format_getnextarg(ctx);
14454 if (v == NULL)
14455 return -1;
14456
Victor Stinnera47082312012-10-04 02:19:54 +020014457
14458 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014459 case 's':
14460 case 'r':
14461 case 'a':
14462 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14463 /* Fast path */
14464 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14465 return -1;
14466 return 1;
14467 }
14468
14469 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14470 *p_str = v;
14471 Py_INCREF(*p_str);
14472 }
14473 else {
14474 if (arg->ch == 's')
14475 *p_str = PyObject_Str(v);
14476 else if (arg->ch == 'r')
14477 *p_str = PyObject_Repr(v);
14478 else
14479 *p_str = PyObject_ASCII(v);
14480 }
14481 break;
14482
14483 case 'i':
14484 case 'd':
14485 case 'u':
14486 case 'o':
14487 case 'x':
14488 case 'X':
14489 {
14490 int ret = mainformatlong(v, arg, p_str, writer);
14491 if (ret != 0)
14492 return ret;
14493 arg->sign = 1;
14494 break;
14495 }
14496
14497 case 'e':
14498 case 'E':
14499 case 'f':
14500 case 'F':
14501 case 'g':
14502 case 'G':
14503 if (arg->width == -1 && arg->prec == -1
14504 && !(arg->flags & (F_SIGN | F_BLANK)))
14505 {
14506 /* Fast path */
14507 if (formatfloat(v, arg, NULL, writer) == -1)
14508 return -1;
14509 return 1;
14510 }
14511
14512 arg->sign = 1;
14513 if (formatfloat(v, arg, p_str, NULL) == -1)
14514 return -1;
14515 break;
14516
14517 case 'c':
14518 {
14519 Py_UCS4 ch = formatchar(v);
14520 if (ch == (Py_UCS4) -1)
14521 return -1;
14522 if (arg->width == -1 && arg->prec == -1) {
14523 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014524 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014525 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014526 return 1;
14527 }
14528 *p_str = PyUnicode_FromOrdinal(ch);
14529 break;
14530 }
14531
14532 default:
14533 PyErr_Format(PyExc_ValueError,
14534 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020014535 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020014536 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14537 (int)arg->ch,
14538 ctx->fmtpos - 1);
14539 return -1;
14540 }
14541 if (*p_str == NULL)
14542 return -1;
14543 assert (PyUnicode_Check(*p_str));
14544 return 0;
14545}
14546
14547static int
14548unicode_format_arg_output(struct unicode_formatter_t *ctx,
14549 struct unicode_format_arg_t *arg,
14550 PyObject *str)
14551{
14552 Py_ssize_t len;
14553 enum PyUnicode_Kind kind;
14554 void *pbuf;
14555 Py_ssize_t pindex;
14556 Py_UCS4 signchar;
14557 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014558 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014559 Py_ssize_t sublen;
14560 _PyUnicodeWriter *writer = &ctx->writer;
14561 Py_UCS4 fill;
14562
14563 fill = ' ';
14564 if (arg->sign && arg->flags & F_ZERO)
14565 fill = '0';
14566
14567 if (PyUnicode_READY(str) == -1)
14568 return -1;
14569
14570 len = PyUnicode_GET_LENGTH(str);
14571 if ((arg->width == -1 || arg->width <= len)
14572 && (arg->prec == -1 || arg->prec >= len)
14573 && !(arg->flags & (F_SIGN | F_BLANK)))
14574 {
14575 /* Fast path */
14576 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
14577 return -1;
14578 return 0;
14579 }
14580
14581 /* Truncate the string for "s", "r" and "a" formats
14582 if the precision is set */
14583 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
14584 if (arg->prec >= 0 && len > arg->prec)
14585 len = arg->prec;
14586 }
14587
14588 /* Adjust sign and width */
14589 kind = PyUnicode_KIND(str);
14590 pbuf = PyUnicode_DATA(str);
14591 pindex = 0;
14592 signchar = '\0';
14593 if (arg->sign) {
14594 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14595 if (ch == '-' || ch == '+') {
14596 signchar = ch;
14597 len--;
14598 pindex++;
14599 }
14600 else if (arg->flags & F_SIGN)
14601 signchar = '+';
14602 else if (arg->flags & F_BLANK)
14603 signchar = ' ';
14604 else
14605 arg->sign = 0;
14606 }
14607 if (arg->width < len)
14608 arg->width = len;
14609
14610 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014611 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014612 if (!(arg->flags & F_LJUST)) {
14613 if (arg->sign) {
14614 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014615 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014616 }
14617 else {
14618 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014619 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014620 }
14621 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014622 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
14623 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014624 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014625 }
14626
Victor Stinnera47082312012-10-04 02:19:54 +020014627 buflen = arg->width;
14628 if (arg->sign && len == arg->width)
14629 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014630 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020014631 return -1;
14632
14633 /* Write the sign if needed */
14634 if (arg->sign) {
14635 if (fill != ' ') {
14636 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14637 writer->pos += 1;
14638 }
14639 if (arg->width > len)
14640 arg->width--;
14641 }
14642
14643 /* Write the numeric prefix for "x", "X" and "o" formats
14644 if the alternate form is used.
14645 For example, write "0x" for the "%#x" format. */
14646 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14647 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14648 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14649 if (fill != ' ') {
14650 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14651 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14652 writer->pos += 2;
14653 pindex += 2;
14654 }
14655 arg->width -= 2;
14656 if (arg->width < 0)
14657 arg->width = 0;
14658 len -= 2;
14659 }
14660
14661 /* Pad left with the fill character if needed */
14662 if (arg->width > len && !(arg->flags & F_LJUST)) {
14663 sublen = arg->width - len;
14664 FILL(writer->kind, writer->data, fill, writer->pos, sublen);
14665 writer->pos += sublen;
14666 arg->width = len;
14667 }
14668
14669 /* If padding with spaces: write sign if needed and/or numeric prefix if
14670 the alternate form is used */
14671 if (fill == ' ') {
14672 if (arg->sign) {
14673 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14674 writer->pos += 1;
14675 }
14676 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14677 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14678 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
14679 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14680 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14681 writer->pos += 2;
14682 pindex += 2;
14683 }
14684 }
14685
14686 /* Write characters */
14687 if (len) {
14688 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14689 str, pindex, len);
14690 writer->pos += len;
14691 }
14692
14693 /* Pad right with the fill character if needed */
14694 if (arg->width > len) {
14695 sublen = arg->width - len;
14696 FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
14697 writer->pos += sublen;
14698 }
14699 return 0;
14700}
14701
14702/* Helper of PyUnicode_Format(): format one arg.
14703 Return 0 on success, raise an exception and return -1 on error. */
14704static int
14705unicode_format_arg(struct unicode_formatter_t *ctx)
14706{
14707 struct unicode_format_arg_t arg;
14708 PyObject *str;
14709 int ret;
14710
Victor Stinner8dbd4212012-12-04 09:30:24 +010014711 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
14712 arg.flags = 0;
14713 arg.width = -1;
14714 arg.prec = -1;
14715 arg.sign = 0;
14716 str = NULL;
14717
Victor Stinnera47082312012-10-04 02:19:54 +020014718 ret = unicode_format_arg_parse(ctx, &arg);
14719 if (ret == -1)
14720 return -1;
14721
14722 ret = unicode_format_arg_format(ctx, &arg, &str);
14723 if (ret == -1)
14724 return -1;
14725
14726 if (ret != 1) {
14727 ret = unicode_format_arg_output(ctx, &arg, str);
14728 Py_DECREF(str);
14729 if (ret == -1)
14730 return -1;
14731 }
14732
14733 if (ctx->dict && (ctx->argidx < ctx->arglen) && arg.ch != '%') {
14734 PyErr_SetString(PyExc_TypeError,
14735 "not all arguments converted during string formatting");
14736 return -1;
14737 }
14738 return 0;
14739}
14740
Alexander Belopolsky40018472011-02-26 01:02:56 +000014741PyObject *
14742PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014743{
Victor Stinnera47082312012-10-04 02:19:54 +020014744 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000014745
Guido van Rossumd57fd912000-03-10 22:53:23 +000014746 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014747 PyErr_BadInternalCall();
14748 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014749 }
Victor Stinnera47082312012-10-04 02:19:54 +020014750
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014751 if (ensure_unicode(format) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014752 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014753
14754 ctx.fmtstr = format;
Victor Stinnera47082312012-10-04 02:19:54 +020014755 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14756 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14757 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14758 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014759
Victor Stinner8f674cc2013-04-17 23:02:17 +020014760 _PyUnicodeWriter_Init(&ctx.writer);
14761 ctx.writer.min_length = ctx.fmtcnt + 100;
14762 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014763
Guido van Rossumd57fd912000-03-10 22:53:23 +000014764 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014765 ctx.arglen = PyTuple_Size(args);
14766 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014767 }
14768 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014769 ctx.arglen = -1;
14770 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014771 }
Victor Stinnera47082312012-10-04 02:19:54 +020014772 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040014773 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020014774 ctx.dict = args;
14775 else
14776 ctx.dict = NULL;
14777 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014778
Victor Stinnera47082312012-10-04 02:19:54 +020014779 while (--ctx.fmtcnt >= 0) {
14780 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020014781 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020014782
14783 nonfmtpos = ctx.fmtpos++;
14784 while (ctx.fmtcnt >= 0 &&
14785 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14786 ctx.fmtpos++;
14787 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014788 }
Victor Stinnera47082312012-10-04 02:19:54 +020014789 if (ctx.fmtcnt < 0) {
14790 ctx.fmtpos--;
14791 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020014792 }
Victor Stinneree4544c2012-05-09 22:24:08 +020014793
Victor Stinnercfc4c132013-04-03 01:48:39 +020014794 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14795 nonfmtpos, ctx.fmtpos) < 0)
14796 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014797 }
14798 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014799 ctx.fmtpos++;
14800 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000014801 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020014802 }
14803 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020014804
Victor Stinnera47082312012-10-04 02:19:54 +020014805 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014806 PyErr_SetString(PyExc_TypeError,
14807 "not all arguments converted during string formatting");
14808 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014809 }
14810
Victor Stinnera47082312012-10-04 02:19:54 +020014811 if (ctx.args_owned) {
14812 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014813 }
Victor Stinnera47082312012-10-04 02:19:54 +020014814 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014815
Benjamin Peterson29060642009-01-31 22:14:21 +000014816 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020014817 _PyUnicodeWriter_Dealloc(&ctx.writer);
14818 if (ctx.args_owned) {
14819 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014820 }
14821 return NULL;
14822}
14823
Jeremy Hylton938ace62002-07-17 16:30:39 +000014824static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000014825unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14826
Tim Peters6d6c1a32001-08-02 04:15:00 +000014827static PyObject *
14828unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14829{
Benjamin Peterson29060642009-01-31 22:14:21 +000014830 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014831 static char *kwlist[] = {"object", "encoding", "errors", 0};
14832 char *encoding = NULL;
14833 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000014834
Benjamin Peterson14339b62009-01-31 16:36:08 +000014835 if (type != &PyUnicode_Type)
14836 return unicode_subtype_new(type, args, kwds);
14837 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000014838 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000014839 return NULL;
14840 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020014841 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000014842 if (encoding == NULL && errors == NULL)
14843 return PyObject_Str(x);
14844 else
Benjamin Peterson29060642009-01-31 22:14:21 +000014845 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000014846}
14847
Guido van Rossume023fe02001-08-30 03:12:59 +000014848static PyObject *
14849unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14850{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014851 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014852 Py_ssize_t length, char_size;
14853 int share_wstr, share_utf8;
14854 unsigned int kind;
14855 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000014856
Benjamin Peterson14339b62009-01-31 16:36:08 +000014857 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014858
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014859 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014860 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014861 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014862 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050014863 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060014864 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014865 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060014866 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014867
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014868 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014869 if (self == NULL) {
14870 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014871 return NULL;
14872 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014873 kind = PyUnicode_KIND(unicode);
14874 length = PyUnicode_GET_LENGTH(unicode);
14875
14876 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014877#ifdef Py_DEBUG
14878 _PyUnicode_HASH(self) = -1;
14879#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014880 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014881#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014882 _PyUnicode_STATE(self).interned = 0;
14883 _PyUnicode_STATE(self).kind = kind;
14884 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020014885 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014886 _PyUnicode_STATE(self).ready = 1;
14887 _PyUnicode_WSTR(self) = NULL;
14888 _PyUnicode_UTF8_LENGTH(self) = 0;
14889 _PyUnicode_UTF8(self) = NULL;
14890 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020014891 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014892
14893 share_utf8 = 0;
14894 share_wstr = 0;
14895 if (kind == PyUnicode_1BYTE_KIND) {
14896 char_size = 1;
14897 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
14898 share_utf8 = 1;
14899 }
14900 else if (kind == PyUnicode_2BYTE_KIND) {
14901 char_size = 2;
14902 if (sizeof(wchar_t) == 2)
14903 share_wstr = 1;
14904 }
14905 else {
14906 assert(kind == PyUnicode_4BYTE_KIND);
14907 char_size = 4;
14908 if (sizeof(wchar_t) == 4)
14909 share_wstr = 1;
14910 }
14911
14912 /* Ensure we won't overflow the length. */
14913 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
14914 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014915 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014916 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014917 data = PyObject_MALLOC((length + 1) * char_size);
14918 if (data == NULL) {
14919 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014920 goto onError;
14921 }
14922
Victor Stinnerc3c74152011-10-02 20:39:55 +020014923 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014924 if (share_utf8) {
14925 _PyUnicode_UTF8_LENGTH(self) = length;
14926 _PyUnicode_UTF8(self) = data;
14927 }
14928 if (share_wstr) {
14929 _PyUnicode_WSTR_LENGTH(self) = length;
14930 _PyUnicode_WSTR(self) = (wchar_t *)data;
14931 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014932
Christian Heimesf051e432016-09-13 20:22:02 +020014933 memcpy(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020014934 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014935 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014936#ifdef Py_DEBUG
14937 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14938#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020014939 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010014940 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014941
14942onError:
14943 Py_DECREF(unicode);
14944 Py_DECREF(self);
14945 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000014946}
14947
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000014948PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070014949"str(object='') -> str\n\
14950str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000014951\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100014952Create a new string object from the given object. If encoding or\n\
14953errors is specified, then the object must expose a data buffer\n\
14954that will be decoded using the given encoding and error handler.\n\
14955Otherwise, returns the result of object.__str__() (if defined)\n\
14956or repr(object).\n\
14957encoding defaults to sys.getdefaultencoding().\n\
14958errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000014959
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014960static PyObject *unicode_iter(PyObject *seq);
14961
Guido van Rossumd57fd912000-03-10 22:53:23 +000014962PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000014963 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014964 "str", /* tp_name */
14965 sizeof(PyUnicodeObject), /* tp_size */
14966 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014967 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014968 (destructor)unicode_dealloc, /* tp_dealloc */
14969 0, /* tp_print */
14970 0, /* tp_getattr */
14971 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014972 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014973 unicode_repr, /* tp_repr */
14974 &unicode_as_number, /* tp_as_number */
14975 &unicode_as_sequence, /* tp_as_sequence */
14976 &unicode_as_mapping, /* tp_as_mapping */
14977 (hashfunc) unicode_hash, /* tp_hash*/
14978 0, /* tp_call*/
14979 (reprfunc) unicode_str, /* tp_str */
14980 PyObject_GenericGetAttr, /* tp_getattro */
14981 0, /* tp_setattro */
14982 0, /* tp_as_buffer */
14983 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000014984 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014985 unicode_doc, /* tp_doc */
14986 0, /* tp_traverse */
14987 0, /* tp_clear */
14988 PyUnicode_RichCompare, /* tp_richcompare */
14989 0, /* tp_weaklistoffset */
14990 unicode_iter, /* tp_iter */
14991 0, /* tp_iternext */
14992 unicode_methods, /* tp_methods */
14993 0, /* tp_members */
14994 0, /* tp_getset */
14995 &PyBaseObject_Type, /* tp_base */
14996 0, /* tp_dict */
14997 0, /* tp_descr_get */
14998 0, /* tp_descr_set */
14999 0, /* tp_dictoffset */
15000 0, /* tp_init */
15001 0, /* tp_alloc */
15002 unicode_new, /* tp_new */
15003 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015004};
15005
15006/* Initialize the Unicode implementation */
15007
Victor Stinner3a50e702011-10-18 21:21:00 +020015008int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015009{
Thomas Wouters477c8d52006-05-27 19:21:47 +000015010 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015011 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000015012 0x000A, /* LINE FEED */
15013 0x000D, /* CARRIAGE RETURN */
15014 0x001C, /* FILE SEPARATOR */
15015 0x001D, /* GROUP SEPARATOR */
15016 0x001E, /* RECORD SEPARATOR */
15017 0x0085, /* NEXT LINE */
15018 0x2028, /* LINE SEPARATOR */
15019 0x2029, /* PARAGRAPH SEPARATOR */
15020 };
15021
Fred Drakee4315f52000-05-09 19:53:39 +000015022 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020015023 _Py_INCREF_UNICODE_EMPTY();
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015024 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015025 Py_FatalError("Can't create empty string");
Serhiy Storchaka678db842013-01-26 12:16:36 +020015026 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015027
Guido van Rossumcacfc072002-05-24 19:01:59 +000015028 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000015029 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000015030
15031 /* initialize the linebreak bloom filter */
15032 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015033 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020015034 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015035
Christian Heimes26532f72013-07-20 14:57:16 +020015036 if (PyType_Ready(&EncodingMapType) < 0)
15037 Py_FatalError("Can't initialize encoding map type");
Victor Stinner3a50e702011-10-18 21:21:00 +020015038
Benjamin Petersonc4311282012-10-30 23:21:10 -040015039 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
15040 Py_FatalError("Can't initialize field name iterator type");
15041
15042 if (PyType_Ready(&PyFormatterIter_Type) < 0)
15043 Py_FatalError("Can't initialize formatter iter type");
Benjamin Petersone8ea97f2012-10-30 23:27:52 -040015044
Victor Stinner3a50e702011-10-18 21:21:00 +020015045 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015046}
15047
/* Finalize the Unicode implementation */

/* Clear the free list of str objects.  This implementation has nothing to
   release: always return 0. */
int
PyUnicode_ClearFreeList(void)
{
    return 0;
}
15055
Guido van Rossumd57fd912000-03-10 22:53:23 +000015056void
Thomas Wouters78890102000-07-22 19:25:51 +000015057_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015058{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000015059 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015060
Serhiy Storchaka05997252013-01-26 12:14:02 +020015061 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +000015062
Serhiy Storchaka05997252013-01-26 12:14:02 +020015063 for (i = 0; i < 256; i++)
15064 Py_CLEAR(unicode_latin1[i]);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020015065 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000015066 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015067}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015068
Walter Dörwald16807132007-05-25 13:52:07 +000015069void
15070PyUnicode_InternInPlace(PyObject **p)
15071{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015072 PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015073 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015074#ifdef Py_DEBUG
15075 assert(s != NULL);
15076 assert(_PyUnicode_CHECK(s));
15077#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000015078 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020015079 return;
15080#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015081 /* If it's a subclass, we don't really know what putting
15082 it in the interned dict might do. */
15083 if (!PyUnicode_CheckExact(s))
15084 return;
15085 if (PyUnicode_CHECK_INTERNED(s))
15086 return;
15087 if (interned == NULL) {
15088 interned = PyDict_New();
15089 if (interned == NULL) {
15090 PyErr_Clear(); /* Don't leave an exception */
15091 return;
15092 }
15093 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015094 Py_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015095 t = PyDict_SetDefault(interned, s, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015096 Py_END_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015097 if (t == NULL) {
15098 PyErr_Clear();
15099 return;
15100 }
15101 if (t != s) {
Victor Stinnerf0335102013-04-14 19:13:03 +020015102 Py_INCREF(t);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030015103 Py_SETREF(*p, t);
Victor Stinnerf0335102013-04-14 19:13:03 +020015104 return;
15105 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015106 /* The two references in interned are not counted by refcnt.
15107 The deallocator will take care of this */
15108 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015109 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000015110}
15111
15112void
15113PyUnicode_InternImmortal(PyObject **p)
15114{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015115 PyUnicode_InternInPlace(p);
15116 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015117 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015118 Py_INCREF(*p);
15119 }
Walter Dörwald16807132007-05-25 13:52:07 +000015120}
15121
15122PyObject *
15123PyUnicode_InternFromString(const char *cp)
15124{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015125 PyObject *s = PyUnicode_FromString(cp);
15126 if (s == NULL)
15127 return NULL;
15128 PyUnicode_InternInPlace(&s);
15129 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015130}
15131
Alexander Belopolsky40018472011-02-26 01:02:56 +000015132void
15133_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000015134{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015135 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015136 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015137 Py_ssize_t i, n;
15138 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000015139
Benjamin Peterson14339b62009-01-31 16:36:08 +000015140 if (interned == NULL || !PyDict_Check(interned))
15141 return;
15142 keys = PyDict_Keys(interned);
15143 if (keys == NULL || !PyList_Check(keys)) {
15144 PyErr_Clear();
15145 return;
15146 }
Walter Dörwald16807132007-05-25 13:52:07 +000015147
Benjamin Peterson14339b62009-01-31 16:36:08 +000015148 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
15149 detector, interned unicode strings are not forcibly deallocated;
15150 rather, we give them their stolen references back, and then clear
15151 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000015152
Benjamin Peterson14339b62009-01-31 16:36:08 +000015153 n = PyList_GET_SIZE(keys);
15154 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000015155 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015156 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015157 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015158 if (PyUnicode_READY(s) == -1) {
15159 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015160 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015161 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015162 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015163 case SSTATE_NOT_INTERNED:
15164 /* XXX Shouldn't happen */
15165 break;
15166 case SSTATE_INTERNED_IMMORTAL:
15167 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015168 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015169 break;
15170 case SSTATE_INTERNED_MORTAL:
15171 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015172 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015173 break;
15174 default:
15175 Py_FatalError("Inconsistent interned string state.");
15176 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015177 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015178 }
15179 fprintf(stderr, "total size of all interned strings: "
15180 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
15181 "mortal/immortal\n", mortal_size, immortal_size);
15182 Py_DECREF(keys);
15183 PyDict_Clear(interned);
Serhiy Storchaka05997252013-01-26 12:14:02 +020015184 Py_CLEAR(interned);
Walter Dörwald16807132007-05-25 13:52:07 +000015185}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015186
15187
15188/********************* Unicode Iterator **************************/
15189
15190typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015191 PyObject_HEAD
15192 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015193 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015194} unicodeiterobject;
15195
15196static void
15197unicodeiter_dealloc(unicodeiterobject *it)
15198{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015199 _PyObject_GC_UNTRACK(it);
15200 Py_XDECREF(it->it_seq);
15201 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015202}
15203
15204static int
15205unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15206{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015207 Py_VISIT(it->it_seq);
15208 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015209}
15210
15211static PyObject *
15212unicodeiter_next(unicodeiterobject *it)
15213{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015214 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015215
Benjamin Peterson14339b62009-01-31 16:36:08 +000015216 assert(it != NULL);
15217 seq = it->it_seq;
15218 if (seq == NULL)
15219 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015220 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015221
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015222 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15223 int kind = PyUnicode_KIND(seq);
15224 void *data = PyUnicode_DATA(seq);
15225 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15226 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015227 if (item != NULL)
15228 ++it->it_index;
15229 return item;
15230 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015231
Benjamin Peterson14339b62009-01-31 16:36:08 +000015232 it->it_seq = NULL;
Serhiy Storchakafbb1c5e2016-03-30 20:40:02 +030015233 Py_DECREF(seq);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015234 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015235}
15236
15237static PyObject *
15238unicodeiter_len(unicodeiterobject *it)
15239{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015240 Py_ssize_t len = 0;
15241 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015242 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015243 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015244}
15245
15246PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15247
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015248static PyObject *
15249unicodeiter_reduce(unicodeiterobject *it)
15250{
15251 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020015252 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015253 it->it_seq, it->it_index);
15254 } else {
15255 PyObject *u = PyUnicode_FromUnicode(NULL, 0);
15256 if (u == NULL)
15257 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020015258 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015259 }
15260}
15261
15262PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15263
15264static PyObject *
15265unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15266{
15267 Py_ssize_t index = PyLong_AsSsize_t(state);
15268 if (index == -1 && PyErr_Occurred())
15269 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015270 if (it->it_seq != NULL) {
15271 if (index < 0)
15272 index = 0;
15273 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15274 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15275 it->it_index = index;
15276 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015277 Py_RETURN_NONE;
15278}
15279
15280PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15281
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015282static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015283 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015284 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015285 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15286 reduce_doc},
15287 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15288 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015289 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015290};
15291
15292PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015293 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15294 "str_iterator", /* tp_name */
15295 sizeof(unicodeiterobject), /* tp_basicsize */
15296 0, /* tp_itemsize */
15297 /* methods */
15298 (destructor)unicodeiter_dealloc, /* tp_dealloc */
15299 0, /* tp_print */
15300 0, /* tp_getattr */
15301 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000015302 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015303 0, /* tp_repr */
15304 0, /* tp_as_number */
15305 0, /* tp_as_sequence */
15306 0, /* tp_as_mapping */
15307 0, /* tp_hash */
15308 0, /* tp_call */
15309 0, /* tp_str */
15310 PyObject_GenericGetAttr, /* tp_getattro */
15311 0, /* tp_setattro */
15312 0, /* tp_as_buffer */
15313 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15314 0, /* tp_doc */
15315 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15316 0, /* tp_clear */
15317 0, /* tp_richcompare */
15318 0, /* tp_weaklistoffset */
15319 PyObject_SelfIter, /* tp_iter */
15320 (iternextfunc)unicodeiter_next, /* tp_iternext */
15321 unicodeiter_methods, /* tp_methods */
15322 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015323};
15324
15325static PyObject *
15326unicode_iter(PyObject *seq)
15327{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015328 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015329
Benjamin Peterson14339b62009-01-31 16:36:08 +000015330 if (!PyUnicode_Check(seq)) {
15331 PyErr_BadInternalCall();
15332 return NULL;
15333 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015334 if (PyUnicode_READY(seq) == -1)
15335 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015336 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15337 if (it == NULL)
15338 return NULL;
15339 it->it_index = 0;
15340 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015341 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015342 _PyObject_GC_TRACK(it);
15343 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015344}
15345
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015346
15347size_t
15348Py_UNICODE_strlen(const Py_UNICODE *u)
15349{
15350 int res = 0;
15351 while(*u++)
15352 res++;
15353 return res;
15354}
15355
15356Py_UNICODE*
15357Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15358{
15359 Py_UNICODE *u = s1;
15360 while ((*u++ = *s2++));
15361 return s1;
15362}
15363
15364Py_UNICODE*
15365Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15366{
15367 Py_UNICODE *u = s1;
15368 while ((*u++ = *s2++))
15369 if (n-- == 0)
15370 break;
15371 return s1;
15372}
15373
15374Py_UNICODE*
15375Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15376{
15377 Py_UNICODE *u1 = s1;
15378 u1 += Py_UNICODE_strlen(u1);
15379 Py_UNICODE_strcpy(u1, s2);
15380 return s1;
15381}
15382
15383int
15384Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15385{
15386 while (*s1 && *s2 && *s1 == *s2)
15387 s1++, s2++;
15388 if (*s1 && *s2)
15389 return (*s1 < *s2) ? -1 : +1;
15390 if (*s1)
15391 return 1;
15392 if (*s2)
15393 return -1;
15394 return 0;
15395}
15396
15397int
15398Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15399{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015400 Py_UNICODE u1, u2;
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015401 for (; n != 0; n--) {
15402 u1 = *s1;
15403 u2 = *s2;
15404 if (u1 != u2)
15405 return (u1 < u2) ? -1 : +1;
15406 if (u1 == '\0')
15407 return 0;
15408 s1++;
15409 s2++;
15410 }
15411 return 0;
15412}
15413
15414Py_UNICODE*
15415Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15416{
15417 const Py_UNICODE *p;
15418 for (p = s; *p; p++)
15419 if (*p == c)
15420 return (Py_UNICODE*)p;
15421 return NULL;
15422}
15423
15424Py_UNICODE*
15425Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15426{
15427 const Py_UNICODE *p;
15428 p = s + Py_UNICODE_strlen(s);
15429 while (p != s) {
15430 p--;
15431 if (*p == c)
15432 return (Py_UNICODE*)p;
15433 }
15434 return NULL;
15435}
Victor Stinner331ea922010-08-10 16:37:20 +000015436
Victor Stinner71133ff2010-09-01 23:43:53 +000015437Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015438PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015439{
Victor Stinner577db2c2011-10-11 22:12:48 +020015440 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015441 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015442
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015443 if (!PyUnicode_Check(unicode)) {
15444 PyErr_BadArgument();
15445 return NULL;
15446 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015447 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015448 if (u == NULL)
15449 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015450 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -070015451 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015452 PyErr_NoMemory();
15453 return NULL;
15454 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015455 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015456 size *= sizeof(Py_UNICODE);
15457 copy = PyMem_Malloc(size);
15458 if (copy == NULL) {
15459 PyErr_NoMemory();
15460 return NULL;
15461 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015462 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015463 return copy;
15464}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015465
Georg Brandl66c221e2010-10-14 07:04:07 +000015466/* A _string module, to export formatter_parser and formatter_field_name_split
15467 to the string.Formatter class implemented in Python. */
15468
15469static PyMethodDef _string_methods[] = {
15470 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15471 METH_O, PyDoc_STR("split the argument as a field name")},
15472 {"formatter_parser", (PyCFunction) formatter_parser,
15473 METH_O, PyDoc_STR("parse the argument as a format string")},
15474 {NULL, NULL}
15475};
15476
15477static struct PyModuleDef _string_module = {
15478 PyModuleDef_HEAD_INIT,
15479 "_string",
15480 PyDoc_STR("string helper module"),
15481 0,
15482 _string_methods,
15483 NULL,
15484 NULL,
15485 NULL,
15486 NULL
15487};
15488
15489PyMODINIT_FUNC
15490PyInit__string(void)
15491{
15492 return PyModule_Create(&_string_module);
15493}
15494
15495
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000015496#ifdef __cplusplus
15497}
15498#endif