blob: 86f23c9c7ce804205a234152e9c163c0e0938b74 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Raymond Hettingerac2ef652015-07-04 16:04:44 -070045#include "stringlib/eq.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000046
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000047#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000048#include <windows.h>
49#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000050
Larry Hastings61272b72014-01-07 12:41:53 -080051/*[clinic input]
Larry Hastingsc2047262014-01-25 20:43:29 -080052class str "PyUnicodeObject *" "&PyUnicode_Type"
Larry Hastings61272b72014-01-07 12:41:53 -080053[clinic start generated code]*/
Larry Hastings581ee362014-01-28 05:00:08 -080054/*[clinic end generated code: output=da39a3ee5e6b4b0d input=604e916854800fa8]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080055
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000056/* --- Globals ------------------------------------------------------------
57
Serhiy Storchaka05997252013-01-26 12:14:02 +020058NOTE: In the interpreter's initialization phase, some globals are currently
59 initialized dynamically as needed. In the process Unicode objects may
60 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000061
62*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000063
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000064
65#ifdef __cplusplus
66extern "C" {
67#endif
68
Victor Stinner8faf8212011-12-08 22:14:11 +010069/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
70#define MAX_UNICODE 0x10ffff
71
Victor Stinner910337b2011-10-03 03:20:16 +020072#ifdef Py_DEBUG
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020073# define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
Victor Stinner910337b2011-10-03 03:20:16 +020074#else
75# define _PyUnicode_CHECK(op) PyUnicode_Check(op)
76#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020077
/* --- Raw field accessors -------------------------------------------------
   Each of the underscore-prefixed macros below expands to a plain struct
   member access on the object cast to the relevant layout, so it can be
   used both to read and to assign the field. */

/* Cached UTF-8 buffer pointer and its length (PyCompactUnicodeObject). */
#define _PyUnicode_UTF8(op) (((PyCompactUnicodeObject*)(op))->utf8)
#define _PyUnicode_UTF8_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->utf8_length)

/* wchar_t buffer pointer (PyASCIIObject) and its length
   (PyCompactUnicodeObject). */
#define _PyUnicode_WSTR(op) (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->wstr_length)

/* Length, state bit-field and cached hash (PyASCIIObject). */
#define _PyUnicode_LENGTH(op) (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op) (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op) (((PyASCIIObject *)(op))->hash)

/* Untyped data pointer of a PyUnicodeObject. */
#define _PyUnicode_DATA_ANY(op) (((PyUnicodeObject*)(op))->data.any)

/* --- Checked accessors ---------------------------------------------------
   These assert _PyUnicode_CHECK(op) before touching the object. */

#define _PyUnicode_KIND(op) \
    (assert(_PyUnicode_CHECK(op)), ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), ((PyASCIIObject *)(op))->length)

/* UTF-8 pointer of a ready string: for a compact ASCII object this is the
   memory immediately following the PyASCIIObject header, otherwise the
   utf8 field. */
#define PyUnicode_UTF8(op)                                   \
    (assert(_PyUnicode_CHECK(op)),                           \
     assert(PyUnicode_IS_READY(op)),                         \
     PyUnicode_IS_COMPACT_ASCII(op)                          \
         ? ((char*)((PyASCIIObject*)(op) + 1))               \
         : _PyUnicode_UTF8(op))

/* UTF-8 length of a ready string: for a compact ASCII object this is the
   length field, otherwise the utf8_length field. */
#define PyUnicode_UTF8_LENGTH(op)                            \
    (assert(_PyUnicode_CHECK(op)),                           \
     assert(PyUnicode_IS_READY(op)),                         \
     PyUnicode_IS_COMPACT_ASCII(op)                          \
         ? ((PyASCIIObject*)(op))->length                    \
         : _PyUnicode_UTF8_LENGTH(op))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200112
/* Local override of PyUnicode_READY: asserts _PyUnicode_CHECK(op), then
   evaluates to 0 when the string is already ready and to the result of
   _PyUnicode_Ready(op) otherwise. */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                                  \
    (assert(_PyUnicode_CHECK(op)),                           \
     (PyUnicode_IS_READY(op) ? 0 : _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200119
/* True if the utf8 pointer of the (non compact-ASCII) object is the very
   same pointer as its character data, i.e. no separate UTF-8 buffer is
   held. */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer of the object is the very same pointer as its
   character data.  The macro must only refer to its own parameter: it
   previously expanded to _PyUnicode_WSTR(unicode), which compiled only
   when the caller happened to have a variable named "unicode" in scope
   and silently inspected that variable instead of the argument. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
127
/* Non-zero when the object carries its own UTF-8 buffer: it is not a
   compact ASCII string, its utf8 pointer is set, and that pointer is not
   simply the character data. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                       \
    (!PyUnicode_IS_COMPACT_ASCII(op)                         \
     && _PyUnicode_UTF8(op) != NULL                          \
     && _PyUnicode_UTF8(op) != PyUnicode_DATA(op))

/* Non-zero when the object carries its own wstr buffer: the wstr pointer
   is set, and either the string is not ready yet or the pointer is not
   simply the character data. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                       \
    (_PyUnicode_WSTR(op) != NULL                             \
     && (!PyUnicode_IS_READY(op)                             \
         || _PyUnicode_WSTR(op) != PyUnicode_DATA(op)))
141
/* Generic helper macro to convert characters of different types.

   from_type and to_type have to be valid type names.  begin and end are
   pointers to the source characters, which should be of type
   "from_type *" (or "const from_type *"); the half-open range
   [begin, end) is converted.  to is a pointer of type "to_type *" and
   points to the buffer where the result characters are written; it must
   have room for (end - begin) elements.

   The source pointers are cast to "const from_type *" so that passing a
   const buffer does not drop the qualifier.  All temporaries carry a
   leading underscore to keep them out of the caller's namespace.  The
   main loop is unrolled four elements at a time; the second loop copies
   the remaining (at most three) elements. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (const from_type *)(begin); \
        const from_type *_end = (const from_type *)(end);    \
        Py_ssize_t _n = (_end) - (_iter);               \
        const from_type *_unrolled_end =                \
            _iter + _Py_SIZE_ROUND_DOWN(_n, 4);         \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200165
Victor Stinnerfdfbf782015-10-09 00:33:49 +0200166#ifdef MS_WINDOWS
167 /* On Windows, overallocating by 50% is the best factor */
168# define OVERALLOCATE_FACTOR 2
169#else
170 /* On Linux, overallocating by 25% is the best factor */
171# define OVERALLOCATE_FACTOR 4
172#endif
173
Walter Dörwald16807132007-05-25 13:52:07 +0000174/* This dictionary holds all interned unicode strings. Note that references
175 to strings in this dictionary are *not* counted in the string's ob_refcnt.
176 When the interned string reaches a refcnt of 0 the string deallocation
177 function will delete the reference from this dictionary.
178
179 Another way to look at this is to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000180 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000181*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200182static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000183
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000184/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200185static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200186
/* Take a new reference to the shared empty string, creating it with
   PyUnicode_New(0, 0) on first use.  If that creation fails,
   unicode_empty is left NULL and no reference is taken. */
#define _Py_INCREF_UNICODE_EMPTY()                                  \
    do {                                                            \
        if (unicode_empty == NULL) {                                \
            unicode_empty = PyUnicode_New(0, 0);                    \
            if (unicode_empty != NULL) {                            \
                Py_INCREF(unicode_empty);                           \
                assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
            }                                                       \
        }                                                           \
        else {                                                      \
            Py_INCREF(unicode_empty);                               \
        }                                                           \
    } while (0)

/* Return from the enclosing function with the result of
   _Py_INCREF_UNICODE_EMPTY(): unicode_empty, which is NULL if the empty
   string could not be created. */
#define _Py_RETURN_UNICODE_EMPTY()                                  \
    do {                                                            \
        _Py_INCREF_UNICODE_EMPTY();                                 \
        return unicode_empty;                                       \
    } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000205
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200206/* Forward declaration */
207Py_LOCAL_INLINE(int)
208_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
209
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200210/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200211static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200212
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000213/* Single character Unicode strings in the Latin-1 range are being
214 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200215static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000216
/* Lookup table for fast detection of the most frequent whitespace
   characters: entry i is 1 when code point i (0 <= i < 128) is treated as
   whitespace, 0 otherwise.  Entries not listed are zero-initialized. */
const unsigned char _Py_ascii_whitespace[128] = {
    [0x09] = 1,     /* CHARACTER TABULATION */
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
    [0x1F] = 1,     /* UNIT SEPARATOR */
    [0x20] = 1      /* SPACE */
};
247
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200248/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200249static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200250static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100251static int unicode_modifiable(PyObject *unicode);
252
Victor Stinnerfe226c02011-10-03 03:52:20 +0200253
Alexander Belopolsky40018472011-02-26 01:02:56 +0000254static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100255_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200256static PyObject *
257_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
258static PyObject *
259_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
260
261static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000262unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000263 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100264 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000265 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
266
Alexander Belopolsky40018472011-02-26 01:02:56 +0000267static void
268raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300269 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100270 PyObject *unicode,
271 Py_ssize_t startpos, Py_ssize_t endpos,
272 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000273
/* Lookup table for fast detection of line breaks: entry i is 1 when code
   point i (0 <= i < 128) is treated as a line break, 0 otherwise.
   Entries not listed are zero-initialized. */
static const unsigned char ascii_linebreak[128] = {
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1      /* RECORD SEPARATOR */
};
301
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300302#include "clinic/unicodeobject.c.h"
303
/* Identifiers for the error handler names recognized by
   get_error_handler().  _Py_ERROR_OTHER stands for any name that is not
   in the list. */
typedef enum {
    _Py_ERROR_UNKNOWN = 0,
    _Py_ERROR_STRICT,
    _Py_ERROR_SURROGATEESCAPE,
    _Py_ERROR_REPLACE,
    _Py_ERROR_IGNORE,
    _Py_ERROR_BACKSLASHREPLACE,
    _Py_ERROR_SURROGATEPASS,
    _Py_ERROR_XMLCHARREFREPLACE,
    _Py_ERROR_OTHER
} _Py_error_handler;

/* Map an error handler name to its _Py_error_handler identifier.

   errors: NUL-terminated handler name, or NULL.

   Returns _Py_ERROR_STRICT for NULL or "strict", the matching identifier
   for the other recognized names, and _Py_ERROR_OTHER for anything else.
   The comparison is an exact, case-sensitive strcmp(). */
static _Py_error_handler
get_error_handler(const char *errors)
{
    static const struct {
        const char *name;
        _Py_error_handler handler;
    } known_handlers[] = {
        {"strict",            _Py_ERROR_STRICT},
        {"surrogateescape",   _Py_ERROR_SURROGATEESCAPE},
        {"replace",           _Py_ERROR_REPLACE},
        {"ignore",            _Py_ERROR_IGNORE},
        {"backslashreplace",  _Py_ERROR_BACKSLASHREPLACE},
        {"surrogatepass",     _Py_ERROR_SURROGATEPASS},
        {"xmlcharrefreplace", _Py_ERROR_XMLCHARREFREPLACE},
    };
    size_t idx;

    if (errors == NULL) {
        return _Py_ERROR_STRICT;
    }
    for (idx = 0; idx < sizeof(known_handlers) / sizeof(known_handlers[0]); idx++) {
        if (strcmp(errors, known_handlers[idx].name) == 0) {
            return known_handlers[idx].handler;
        }
    }
    return _Py_ERROR_OTHER;
}
342
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300343/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
344 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000345Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000346PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000347{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000348#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000349 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000350#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000351 /* This is actually an illegal character, so it should
352 not be passed to unichr. */
353 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000354#endif
355}
356
#ifdef Py_DEBUG
/* Debug-build sanity check of the internal layout of a string object.

   op:            the object to inspect; asserted to pass PyUnicode_Check().
   check_content: when non-zero (and the string is not in the wchar_t-only
                  representation) the characters are scanned as well, to
                  verify that the narrowest possible kind is used and that
                  the buffer is terminated by a zero character.

   Every check is an assert(); the function itself always returns 1, so it
   can be wrapped in an assert() by callers. */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *base;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    base = (PyASCIIObject *)op;
    kind = base->state.kind;

    if (base->state.ascii != 1 || base->state.compact != 1) {
        /* Anything but a compact ASCII string. */
        PyCompactUnicodeObject *cu = (PyCompactUnicodeObject *)op;
        void *buf;

        if (base->state.compact != 1) {
            /* Non-compact object: the data pointer lives in the struct. */
            PyUnicodeObject *legacy = (PyUnicodeObject *)op;

            buf = legacy->data.any;
            if (kind != PyUnicode_WCHAR_KIND) {
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(base->state.compact == 0);
                assert(base->state.ready == 1);
                assert(buf != NULL);
                if (base->state.ascii) {
                    assert(cu->utf8 == buf);
                    assert(cu->utf8_length == base->length);
                }
                else
                    assert(cu->utf8 != buf);
            }
            else {
                /* wchar_t-only representation, not readied yet. */
                assert(base->length == 0);
                assert(base->hash == -1);
                assert(base->state.compact == 0);
                assert(base->state.ascii == 0);
                assert(base->state.ready == 0);
                assert(base->state.interned == SSTATE_NOT_INTERNED);
                assert(base->wstr != NULL);
                assert(buf == NULL);
                assert(cu->utf8 == NULL);
            }
        }
        else {
            /* Compact non-ASCII object: data follows the compact header. */
            buf = cu + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(base->state.ascii == 0);
            assert(base->state.ready == 1);
            assert(cu->utf8 != buf);
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr aliases the data exactly when the kind has the same
               width as wchar_t. */
#if SIZEOF_WCHAR_T == 2
            const unsigned int wchar_kind = PyUnicode_2BYTE_KIND;
#else
            const unsigned int wchar_kind = PyUnicode_4BYTE_KIND;
#endif
            if (kind == wchar_kind) {
                assert(base->wstr == buf);
                assert(cu->wstr_length == base->length);
            }
            else
                assert(base->wstr != buf);
        }

        /* A missing buffer must have a zero recorded length. */
        assert(cu->utf8 != NULL || cu->utf8_length == 0);
        assert(base->wstr != NULL || cu->wstr_length == 0);
    }
    else {
        /* Compact ASCII string. */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(base->state.ready == 1);
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        Py_ssize_t pos = 0;
        Py_UCS4 highest = 0;
        void *chars = PyUnicode_DATA(base);

        while (pos < base->length) {
            Py_UCS4 cp = PyUnicode_READ(kind, chars, pos);
            if (cp > highest)
                highest = cp;
            pos++;
        }

        switch (kind) {
        case PyUnicode_1BYTE_KIND:
            if (base->state.ascii)
                assert(highest < 128);
            else {
                assert(highest >= 128);
                assert(highest <= 255);
            }
            break;
        case PyUnicode_2BYTE_KIND:
            assert(highest >= 0x100);
            assert(highest <= 0xFFFF);
            break;
        default:
            assert(highest >= 0x10000);
            assert(highest <= MAX_UNICODE);
            break;
        }
        /* The character one past the end must be the terminator. */
        assert(PyUnicode_READ(kind, chars, base->length) == 0);
    }
    return 1;
}
#endif
472
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100473static PyObject*
474unicode_result_wchar(PyObject *unicode)
475{
476#ifndef Py_DEBUG
477 Py_ssize_t len;
478
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100479 len = _PyUnicode_WSTR_LENGTH(unicode);
480 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100481 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200482 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100483 }
484
485 if (len == 1) {
486 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100487 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100488 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
489 Py_DECREF(unicode);
490 return latin1_char;
491 }
492 }
493
494 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200495 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100496 return NULL;
497 }
498#else
Victor Stinneraa771272012-10-04 02:32:58 +0200499 assert(Py_REFCNT(unicode) == 1);
500
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100501 /* don't make the result ready in debug mode to ensure that the caller
502 makes the string ready before using it */
503 assert(_PyUnicode_CheckConsistency(unicode, 1));
504#endif
505 return unicode;
506}
507
508static PyObject*
509unicode_result_ready(PyObject *unicode)
510{
511 Py_ssize_t length;
512
513 length = PyUnicode_GET_LENGTH(unicode);
514 if (length == 0) {
515 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100516 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200517 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100518 }
519 return unicode_empty;
520 }
521
522 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200523 void *data = PyUnicode_DATA(unicode);
524 int kind = PyUnicode_KIND(unicode);
525 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100526 if (ch < 256) {
527 PyObject *latin1_char = unicode_latin1[ch];
528 if (latin1_char != NULL) {
529 if (unicode != latin1_char) {
530 Py_INCREF(latin1_char);
531 Py_DECREF(unicode);
532 }
533 return latin1_char;
534 }
535 else {
536 assert(_PyUnicode_CheckConsistency(unicode, 1));
537 Py_INCREF(unicode);
538 unicode_latin1[ch] = unicode;
539 return unicode;
540 }
541 }
542 }
543
544 assert(_PyUnicode_CheckConsistency(unicode, 1));
545 return unicode;
546}
547
548static PyObject*
549unicode_result(PyObject *unicode)
550{
551 assert(_PyUnicode_CHECK(unicode));
552 if (PyUnicode_IS_READY(unicode))
553 return unicode_result_ready(unicode);
554 else
555 return unicode_result_wchar(unicode);
556}
557
Victor Stinnerc4b49542011-12-11 22:44:26 +0100558static PyObject*
559unicode_result_unchanged(PyObject *unicode)
560{
561 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500562 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100563 return NULL;
564 Py_INCREF(unicode);
565 return unicode;
566 }
567 else
568 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100569 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100570}
571
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200572/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
573 ASCII, Latin1, UTF-8, etc. */
574static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200575backslashreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200576 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
577{
Victor Stinnerad771582015-10-09 12:38:53 +0200578 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200579 Py_UCS4 ch;
580 enum PyUnicode_Kind kind;
581 void *data;
582
583 assert(PyUnicode_IS_READY(unicode));
584 kind = PyUnicode_KIND(unicode);
585 data = PyUnicode_DATA(unicode);
586
587 size = 0;
588 /* determine replacement size */
589 for (i = collstart; i < collend; ++i) {
590 Py_ssize_t incr;
591
592 ch = PyUnicode_READ(kind, data, i);
593 if (ch < 0x100)
594 incr = 2+2;
595 else if (ch < 0x10000)
596 incr = 2+4;
597 else {
598 assert(ch <= MAX_UNICODE);
Victor Stinner3fa36ff2015-10-09 03:37:11 +0200599 incr = 2+8;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200600 }
601 if (size > PY_SSIZE_T_MAX - incr) {
602 PyErr_SetString(PyExc_OverflowError,
603 "encoded result is too long for a Python string");
604 return NULL;
605 }
606 size += incr;
607 }
608
Victor Stinnerad771582015-10-09 12:38:53 +0200609 str = _PyBytesWriter_Prepare(writer, str, size);
610 if (str == NULL)
611 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200612
613 /* generate replacement */
614 for (i = collstart; i < collend; ++i) {
615 ch = PyUnicode_READ(kind, data, i);
Victor Stinner797485e2015-10-09 03:17:30 +0200616 *str++ = '\\';
617 if (ch >= 0x00010000) {
618 *str++ = 'U';
619 *str++ = Py_hexdigits[(ch>>28)&0xf];
620 *str++ = Py_hexdigits[(ch>>24)&0xf];
621 *str++ = Py_hexdigits[(ch>>20)&0xf];
622 *str++ = Py_hexdigits[(ch>>16)&0xf];
623 *str++ = Py_hexdigits[(ch>>12)&0xf];
624 *str++ = Py_hexdigits[(ch>>8)&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200625 }
Victor Stinner797485e2015-10-09 03:17:30 +0200626 else if (ch >= 0x100) {
627 *str++ = 'u';
628 *str++ = Py_hexdigits[(ch>>12)&0xf];
629 *str++ = Py_hexdigits[(ch>>8)&0xf];
630 }
631 else
632 *str++ = 'x';
633 *str++ = Py_hexdigits[(ch>>4)&0xf];
634 *str++ = Py_hexdigits[ch&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200635 }
636 return str;
637}
638
639/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
640 ASCII, Latin1, UTF-8, etc. */
641static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200642xmlcharrefreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200643 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
644{
Victor Stinnerad771582015-10-09 12:38:53 +0200645 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200646 Py_UCS4 ch;
647 enum PyUnicode_Kind kind;
648 void *data;
649
650 assert(PyUnicode_IS_READY(unicode));
651 kind = PyUnicode_KIND(unicode);
652 data = PyUnicode_DATA(unicode);
653
654 size = 0;
655 /* determine replacement size */
656 for (i = collstart; i < collend; ++i) {
657 Py_ssize_t incr;
658
659 ch = PyUnicode_READ(kind, data, i);
660 if (ch < 10)
661 incr = 2+1+1;
662 else if (ch < 100)
663 incr = 2+2+1;
664 else if (ch < 1000)
665 incr = 2+3+1;
666 else if (ch < 10000)
667 incr = 2+4+1;
668 else if (ch < 100000)
669 incr = 2+5+1;
670 else if (ch < 1000000)
671 incr = 2+6+1;
672 else {
673 assert(ch <= MAX_UNICODE);
674 incr = 2+7+1;
675 }
676 if (size > PY_SSIZE_T_MAX - incr) {
677 PyErr_SetString(PyExc_OverflowError,
678 "encoded result is too long for a Python string");
679 return NULL;
680 }
681 size += incr;
682 }
683
Victor Stinnerad771582015-10-09 12:38:53 +0200684 str = _PyBytesWriter_Prepare(writer, str, size);
685 if (str == NULL)
686 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200687
688 /* generate replacement */
689 for (i = collstart; i < collend; ++i) {
690 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
691 }
692 return str;
693}
694
Thomas Wouters477c8d52006-05-27 19:21:47 +0000695/* --- Bloom Filters ----------------------------------------------------- */
696
697/* stuff to implement simple "bloom filters" for Unicode characters.
698 to keep things simple, we use a single bitmask, using the least 5
699 bits from each unicode characters as the bit index. */
700
701/* the linebreak mask is set up by Unicode_Init below */
702
Antoine Pitrouf068f942010-01-13 14:19:12 +0000703#if LONG_BIT >= 128
704#define BLOOM_WIDTH 128
705#elif LONG_BIT >= 64
706#define BLOOM_WIDTH 64
707#elif LONG_BIT >= 32
708#define BLOOM_WIDTH 32
709#else
710#error "LONG_BIT is smaller than 32"
711#endif
712
Thomas Wouters477c8d52006-05-27 19:21:47 +0000713#define BLOOM_MASK unsigned long
714
Serhiy Storchaka05997252013-01-26 12:14:02 +0200715static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000716
Antoine Pitrouf068f942010-01-13 14:19:12 +0000717#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000718
Benjamin Peterson29060642009-01-31 22:14:21 +0000719#define BLOOM_LINEBREAK(ch) \
720 ((ch) < 128U ? ascii_linebreak[(ch)] : \
721 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000722
Alexander Belopolsky40018472011-02-26 01:02:56 +0000723Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200724make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000725{
Victor Stinnera85af502013-04-09 21:53:54 +0200726#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
727 do { \
728 TYPE *data = (TYPE *)PTR; \
729 TYPE *end = data + LEN; \
730 Py_UCS4 ch; \
731 for (; data != end; data++) { \
732 ch = *data; \
733 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
734 } \
735 break; \
736 } while (0)
737
Thomas Wouters477c8d52006-05-27 19:21:47 +0000738 /* calculate simple bloom-style bitmask for a given unicode string */
739
Antoine Pitrouf068f942010-01-13 14:19:12 +0000740 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000741
742 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200743 switch (kind) {
744 case PyUnicode_1BYTE_KIND:
745 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
746 break;
747 case PyUnicode_2BYTE_KIND:
748 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
749 break;
750 case PyUnicode_4BYTE_KIND:
751 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
752 break;
753 default:
754 assert(0);
755 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000756 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200757
758#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000759}
760
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300761static int
762ensure_unicode(PyObject *obj)
763{
764 if (!PyUnicode_Check(obj)) {
765 PyErr_Format(PyExc_TypeError,
766 "must be str, not %.100s",
767 Py_TYPE(obj)->tp_name);
768 return -1;
769 }
770 return PyUnicode_READY(obj);
771}
772
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200773/* Compilation of templated routines */
774
775#include "stringlib/asciilib.h"
776#include "stringlib/fastsearch.h"
777#include "stringlib/partition.h"
778#include "stringlib/split.h"
779#include "stringlib/count.h"
780#include "stringlib/find.h"
781#include "stringlib/find_max_char.h"
782#include "stringlib/localeutil.h"
783#include "stringlib/undef.h"
784
785#include "stringlib/ucs1lib.h"
786#include "stringlib/fastsearch.h"
787#include "stringlib/partition.h"
788#include "stringlib/split.h"
789#include "stringlib/count.h"
790#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300791#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200792#include "stringlib/find_max_char.h"
793#include "stringlib/localeutil.h"
794#include "stringlib/undef.h"
795
796#include "stringlib/ucs2lib.h"
797#include "stringlib/fastsearch.h"
798#include "stringlib/partition.h"
799#include "stringlib/split.h"
800#include "stringlib/count.h"
801#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300802#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200803#include "stringlib/find_max_char.h"
804#include "stringlib/localeutil.h"
805#include "stringlib/undef.h"
806
807#include "stringlib/ucs4lib.h"
808#include "stringlib/fastsearch.h"
809#include "stringlib/partition.h"
810#include "stringlib/split.h"
811#include "stringlib/count.h"
812#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300813#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200814#include "stringlib/find_max_char.h"
815#include "stringlib/localeutil.h"
816#include "stringlib/undef.h"
817
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200818#include "stringlib/unicodedefs.h"
819#include "stringlib/fastsearch.h"
820#include "stringlib/count.h"
821#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100822#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200823
Guido van Rossumd57fd912000-03-10 22:53:23 +0000824/* --- Unicode Object ----------------------------------------------------- */
825
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200826static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200827fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200828
Serhiy Storchakad9d769f2015-03-24 21:55:47 +0200829Py_LOCAL_INLINE(Py_ssize_t) findchar(const void *s, int kind,
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200830 Py_ssize_t size, Py_UCS4 ch,
831 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200832{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200833 switch (kind) {
834 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200835 if ((Py_UCS1) ch != ch)
836 return -1;
837 if (direction > 0)
838 return ucs1lib_find_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
839 else
840 return ucs1lib_rfind_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200841 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200842 if ((Py_UCS2) ch != ch)
843 return -1;
844 if (direction > 0)
845 return ucs2lib_find_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
846 else
847 return ucs2lib_rfind_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200848 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200849 if (direction > 0)
850 return ucs4lib_find_char((Py_UCS4 *) s, size, ch);
851 else
852 return ucs4lib_rfind_char((Py_UCS4 *) s, size, ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200853 default:
854 assert(0);
855 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200856 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200857}
858
#ifdef Py_DEBUG
/* Fill the data of a Unicode string with invalid characters to detect bugs
   earlier.

   Only the characters in [old_length, current length) are overwritten, with
   0xff bytes; nothing is done if the string did not grow.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    int char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        Py_UCS1 *first_new = bytes + old_length * char_size;
        memset(first_new, 0xff, (new_length - old_length) * char_size);
    }
}
#endif
877
Victor Stinnerfe226c02011-10-03 03:52:20 +0200878static PyObject*
879resize_compact(PyObject *unicode, Py_ssize_t length)
880{
881 Py_ssize_t char_size;
882 Py_ssize_t struct_size;
883 Py_ssize_t new_size;
884 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100885 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +0200886#ifdef Py_DEBUG
887 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
888#endif
889
Victor Stinner79891572012-05-03 13:43:07 +0200890 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200891 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100892 assert(PyUnicode_IS_COMPACT(unicode));
893
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200894 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100895 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200896 struct_size = sizeof(PyASCIIObject);
897 else
898 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200899 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200900
Victor Stinnerfe226c02011-10-03 03:52:20 +0200901 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
902 PyErr_NoMemory();
903 return NULL;
904 }
905 new_size = (struct_size + (length + 1) * char_size);
906
Serhiy Storchaka7aa69082015-12-03 01:02:03 +0200907 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
908 PyObject_DEL(_PyUnicode_UTF8(unicode));
909 _PyUnicode_UTF8(unicode) = NULL;
910 _PyUnicode_UTF8_LENGTH(unicode) = 0;
911 }
Victor Stinner84def372011-12-11 20:04:56 +0100912 _Py_DEC_REFTOTAL;
913 _Py_ForgetReference(unicode);
914
Serhiy Storchaka20b39b22014-09-28 11:27:24 +0300915 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +0100916 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100917 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200918 PyErr_NoMemory();
919 return NULL;
920 }
Victor Stinner84def372011-12-11 20:04:56 +0100921 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200922 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100923
Victor Stinnerfe226c02011-10-03 03:52:20 +0200924 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200925 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200926 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100927 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200928 _PyUnicode_WSTR_LENGTH(unicode) = length;
929 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100930 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
931 PyObject_DEL(_PyUnicode_WSTR(unicode));
932 _PyUnicode_WSTR(unicode) = NULL;
Victor Stinner5bc03a62016-01-27 16:56:53 +0100933 if (!PyUnicode_IS_ASCII(unicode))
934 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100935 }
Victor Stinnerafffce42012-10-03 23:03:17 +0200936#ifdef Py_DEBUG
937 unicode_fill_invalid(unicode, old_length);
938#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200939 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
940 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200941 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200942 return unicode;
943}
944
Alexander Belopolsky40018472011-02-26 01:02:56 +0000945static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200946resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000947{
Victor Stinner95663112011-10-04 01:03:50 +0200948 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100949 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200950 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200951 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000952
Victor Stinnerfe226c02011-10-03 03:52:20 +0200953 if (PyUnicode_IS_READY(unicode)) {
954 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200955 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200956 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +0200957#ifdef Py_DEBUG
958 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
959#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200960
961 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200962 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200963 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
964 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200965
966 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
967 PyErr_NoMemory();
968 return -1;
969 }
970 new_size = (length + 1) * char_size;
971
Victor Stinner7a9105a2011-12-12 00:13:42 +0100972 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
973 {
974 PyObject_DEL(_PyUnicode_UTF8(unicode));
975 _PyUnicode_UTF8(unicode) = NULL;
976 _PyUnicode_UTF8_LENGTH(unicode) = 0;
977 }
978
Victor Stinnerfe226c02011-10-03 03:52:20 +0200979 data = (PyObject *)PyObject_REALLOC(data, new_size);
980 if (data == NULL) {
981 PyErr_NoMemory();
982 return -1;
983 }
984 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200985 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200986 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200987 _PyUnicode_WSTR_LENGTH(unicode) = length;
988 }
989 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200990 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200991 _PyUnicode_UTF8_LENGTH(unicode) = length;
992 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200993 _PyUnicode_LENGTH(unicode) = length;
994 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +0200995#ifdef Py_DEBUG
996 unicode_fill_invalid(unicode, old_length);
997#endif
Victor Stinner95663112011-10-04 01:03:50 +0200998 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200999 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001000 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001001 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001002 }
Victor Stinner95663112011-10-04 01:03:50 +02001003 assert(_PyUnicode_WSTR(unicode) != NULL);
1004
1005 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001006 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +02001007 PyErr_NoMemory();
1008 return -1;
1009 }
Victor Stinner7a9105a2011-12-12 00:13:42 +01001010 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +02001011 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +01001012 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +02001013 if (!wstr) {
1014 PyErr_NoMemory();
1015 return -1;
1016 }
1017 _PyUnicode_WSTR(unicode) = wstr;
1018 _PyUnicode_WSTR(unicode)[length] = 0;
1019 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001020 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001021 return 0;
1022}
1023
Victor Stinnerfe226c02011-10-03 03:52:20 +02001024static PyObject*
1025resize_copy(PyObject *unicode, Py_ssize_t length)
1026{
1027 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001028 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001029 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001030
Benjamin Petersonbac79492012-01-14 13:34:47 -05001031 if (PyUnicode_READY(unicode) == -1)
Victor Stinner7a9105a2011-12-12 00:13:42 +01001032 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001033
1034 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1035 if (copy == NULL)
1036 return NULL;
1037
1038 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +02001039 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001040 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +02001041 }
1042 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001043 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001044
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001045 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001046 if (w == NULL)
1047 return NULL;
1048 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1049 copy_length = Py_MIN(copy_length, length);
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +02001050 Py_MEMCPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
1051 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001052 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001053 }
1054}
1055
Guido van Rossumd57fd912000-03-10 22:53:23 +00001056/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +00001057 Ux0000 terminated; some code (e.g. new_identifier)
1058 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001059
1060 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +00001061 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001062
1063*/
1064
Alexander Belopolsky40018472011-02-26 01:02:56 +00001065static PyUnicodeObject *
1066_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001067{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001068 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001069 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001070
Thomas Wouters477c8d52006-05-27 19:21:47 +00001071 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001072 if (length == 0 && unicode_empty != NULL) {
1073 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001074 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001075 }
1076
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001077 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001078 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001079 return (PyUnicodeObject *)PyErr_NoMemory();
1080 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001081 if (length < 0) {
1082 PyErr_SetString(PyExc_SystemError,
1083 "Negative size passed to _PyUnicode_New");
1084 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001085 }
1086
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001087 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1088 if (unicode == NULL)
1089 return NULL;
1090 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +01001091
1092 _PyUnicode_WSTR_LENGTH(unicode) = length;
1093 _PyUnicode_HASH(unicode) = -1;
1094 _PyUnicode_STATE(unicode).interned = 0;
1095 _PyUnicode_STATE(unicode).kind = 0;
1096 _PyUnicode_STATE(unicode).compact = 0;
1097 _PyUnicode_STATE(unicode).ready = 0;
1098 _PyUnicode_STATE(unicode).ascii = 0;
1099 _PyUnicode_DATA_ANY(unicode) = NULL;
1100 _PyUnicode_LENGTH(unicode) = 0;
1101 _PyUnicode_UTF8(unicode) = NULL;
1102 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1103
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001104 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1105 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001106 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00001107 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001108 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00001109 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001110
Jeremy Hyltond8082792003-09-16 19:41:39 +00001111 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +00001112 * the caller fails before initializing str -- unicode_resize()
1113 * reads str[0], and the Keep-Alive optimization can keep memory
1114 * allocated for str alive across a call to unicode_dealloc(unicode).
1115 * We don't want unicode_resize to read uninitialized memory in
1116 * that case.
1117 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001118 _PyUnicode_WSTR(unicode)[0] = 0;
1119 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +01001120
Victor Stinner7931d9a2011-11-04 00:22:48 +01001121 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001122 return unicode;
1123}
1124
Victor Stinnerf42dc442011-10-02 23:33:16 +02001125static const char*
1126unicode_kind_name(PyObject *unicode)
1127{
Victor Stinner42dfd712011-10-03 14:41:45 +02001128 /* don't check consistency: unicode_kind_name() is called from
1129 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +02001130 if (!PyUnicode_IS_COMPACT(unicode))
1131 {
1132 if (!PyUnicode_IS_READY(unicode))
1133 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -06001134 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001135 {
1136 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001137 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001138 return "legacy ascii";
1139 else
1140 return "legacy latin1";
1141 case PyUnicode_2BYTE_KIND:
1142 return "legacy UCS2";
1143 case PyUnicode_4BYTE_KIND:
1144 return "legacy UCS4";
1145 default:
1146 return "<legacy invalid kind>";
1147 }
1148 }
1149 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -06001150 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001151 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001152 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001153 return "ascii";
1154 else
Victor Stinnera3b334d2011-10-03 13:53:37 +02001155 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001156 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001157 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001158 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001159 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001160 default:
1161 return "<invalid compact kind>";
1162 }
1163}
1164
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001165#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001166/* Functions wrapping macros for use in debugger */
1167char *_PyUnicode_utf8(void *unicode){
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001168 return PyUnicode_UTF8(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001169}
1170
1171void *_PyUnicode_compact_data(void *unicode) {
1172 return _PyUnicode_COMPACT_DATA(unicode);
1173}
1174void *_PyUnicode_data(void *unicode){
1175 printf("obj %p\n", unicode);
1176 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1177 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1178 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1179 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1180 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1181 return PyUnicode_DATA(unicode);
1182}
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001183
1184void
1185_PyUnicode_Dump(PyObject *op)
1186{
1187 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001188 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1189 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1190 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001191
Victor Stinnera849a4b2011-10-03 12:12:11 +02001192 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001193 {
1194 if (ascii->state.ascii)
1195 data = (ascii + 1);
1196 else
1197 data = (compact + 1);
1198 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001199 else
1200 data = unicode->data.any;
Victor Stinner293f3f52014-07-01 08:57:10 +02001201 printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1202 unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001203
Victor Stinnera849a4b2011-10-03 12:12:11 +02001204 if (ascii->wstr == data)
1205 printf("shared ");
1206 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001207
Victor Stinnera3b334d2011-10-03 13:53:37 +02001208 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinner293f3f52014-07-01 08:57:10 +02001209 printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
Victor Stinnera849a4b2011-10-03 12:12:11 +02001210 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1211 printf("shared ");
Victor Stinner293f3f52014-07-01 08:57:10 +02001212 printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
1213 compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001214 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001215 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001216}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001217#endif
1218
1219PyObject *
1220PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1221{
1222 PyObject *obj;
1223 PyCompactUnicodeObject *unicode;
1224 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001225 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001226 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001227 Py_ssize_t char_size;
1228 Py_ssize_t struct_size;
1229
1230 /* Optimization for empty strings */
1231 if (size == 0 && unicode_empty != NULL) {
1232 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001233 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001234 }
1235
Victor Stinner9e9d6892011-10-04 01:02:02 +02001236 is_ascii = 0;
1237 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001238 struct_size = sizeof(PyCompactUnicodeObject);
1239 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001240 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001241 char_size = 1;
1242 is_ascii = 1;
1243 struct_size = sizeof(PyASCIIObject);
1244 }
1245 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001246 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001247 char_size = 1;
1248 }
1249 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001250 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001251 char_size = 2;
1252 if (sizeof(wchar_t) == 2)
1253 is_sharing = 1;
1254 }
1255 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001256 if (maxchar > MAX_UNICODE) {
1257 PyErr_SetString(PyExc_SystemError,
1258 "invalid maximum character passed to PyUnicode_New");
1259 return NULL;
1260 }
Victor Stinner8f825062012-04-27 13:55:39 +02001261 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001262 char_size = 4;
1263 if (sizeof(wchar_t) == 4)
1264 is_sharing = 1;
1265 }
1266
1267 /* Ensure we won't overflow the size. */
1268 if (size < 0) {
1269 PyErr_SetString(PyExc_SystemError,
1270 "Negative size passed to PyUnicode_New");
1271 return NULL;
1272 }
1273 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1274 return PyErr_NoMemory();
1275
1276 /* Duplicated allocation code from _PyObject_New() instead of a call to
1277 * PyObject_New() so we are able to allocate space for the object and
1278 * it's data buffer.
1279 */
1280 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1281 if (obj == NULL)
1282 return PyErr_NoMemory();
1283 obj = PyObject_INIT(obj, &PyUnicode_Type);
1284 if (obj == NULL)
1285 return NULL;
1286
1287 unicode = (PyCompactUnicodeObject *)obj;
1288 if (is_ascii)
1289 data = ((PyASCIIObject*)obj) + 1;
1290 else
1291 data = unicode + 1;
1292 _PyUnicode_LENGTH(unicode) = size;
1293 _PyUnicode_HASH(unicode) = -1;
1294 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001295 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001296 _PyUnicode_STATE(unicode).compact = 1;
1297 _PyUnicode_STATE(unicode).ready = 1;
1298 _PyUnicode_STATE(unicode).ascii = is_ascii;
1299 if (is_ascii) {
1300 ((char*)data)[size] = 0;
1301 _PyUnicode_WSTR(unicode) = NULL;
1302 }
Victor Stinner8f825062012-04-27 13:55:39 +02001303 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001304 ((char*)data)[size] = 0;
1305 _PyUnicode_WSTR(unicode) = NULL;
1306 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001307 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001308 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001309 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001310 else {
1311 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001312 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001313 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001314 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001315 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001316 ((Py_UCS4*)data)[size] = 0;
1317 if (is_sharing) {
1318 _PyUnicode_WSTR_LENGTH(unicode) = size;
1319 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1320 }
1321 else {
1322 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1323 _PyUnicode_WSTR(unicode) = NULL;
1324 }
1325 }
Victor Stinner8f825062012-04-27 13:55:39 +02001326#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001327 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001328#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001329 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001330 return obj;
1331}
1332
#if SIZEOF_WCHAR_T == 2
/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
   will decode surrogate pairs, the other conversions are implemented as macros
   for efficiency.

   A high surrogate directly followed by a low surrogate becomes one code
   point; any other unit, lone surrogates included, is copied unchanged.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (Py_UNICODE_IS_HIGH_SURROGATE(src[0])
            && (src+1) < end
            && Py_UNICODE_IS_LOW_SURROGATE(src[1]))
        {
            *dst++ = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
            src += 2;
            continue;
        }
        *dst++ = *src++;
    }
    /* the output must have been filled exactly */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));

}
#endif
1372
Victor Stinnercd9950f2011-10-02 00:34:53 +02001373static int
Victor Stinner488fa492011-12-12 00:01:39 +01001374unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001375{
Victor Stinner488fa492011-12-12 00:01:39 +01001376 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001377 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001378 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001379 return -1;
1380 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001381 return 0;
1382}
1383
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001384static int
1385_copy_characters(PyObject *to, Py_ssize_t to_start,
1386 PyObject *from, Py_ssize_t from_start,
1387 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001388{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001389 unsigned int from_kind, to_kind;
1390 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001391
Victor Stinneree4544c2012-05-09 22:24:08 +02001392 assert(0 <= how_many);
1393 assert(0 <= from_start);
1394 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001395 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001396 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001397 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001398
Victor Stinnerd3f08822012-05-29 12:57:52 +02001399 assert(PyUnicode_Check(to));
1400 assert(PyUnicode_IS_READY(to));
1401 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1402
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001403 if (how_many == 0)
1404 return 0;
1405
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001406 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001407 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001408 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001409 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001410
Victor Stinnerf1852262012-06-16 16:38:26 +02001411#ifdef Py_DEBUG
1412 if (!check_maxchar
1413 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1414 {
1415 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1416 Py_UCS4 ch;
1417 Py_ssize_t i;
1418 for (i=0; i < how_many; i++) {
1419 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1420 assert(ch <= to_maxchar);
1421 }
1422 }
1423#endif
1424
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001425 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001426 if (check_maxchar
1427 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1428 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001429 /* Writing Latin-1 characters into an ASCII string requires to
1430 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001431 Py_UCS4 max_char;
1432 max_char = ucs1lib_find_max_char(from_data,
1433 (Py_UCS1*)from_data + how_many);
1434 if (max_char >= 128)
1435 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001436 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001437 Py_MEMCPY((char*)to_data + to_kind * to_start,
1438 (char*)from_data + from_kind * from_start,
1439 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001440 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001441 else if (from_kind == PyUnicode_1BYTE_KIND
1442 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001443 {
1444 _PyUnicode_CONVERT_BYTES(
1445 Py_UCS1, Py_UCS2,
1446 PyUnicode_1BYTE_DATA(from) + from_start,
1447 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1448 PyUnicode_2BYTE_DATA(to) + to_start
1449 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001450 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001451 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001452 && to_kind == PyUnicode_4BYTE_KIND)
1453 {
1454 _PyUnicode_CONVERT_BYTES(
1455 Py_UCS1, Py_UCS4,
1456 PyUnicode_1BYTE_DATA(from) + from_start,
1457 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1458 PyUnicode_4BYTE_DATA(to) + to_start
1459 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001460 }
1461 else if (from_kind == PyUnicode_2BYTE_KIND
1462 && to_kind == PyUnicode_4BYTE_KIND)
1463 {
1464 _PyUnicode_CONVERT_BYTES(
1465 Py_UCS2, Py_UCS4,
1466 PyUnicode_2BYTE_DATA(from) + from_start,
1467 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1468 PyUnicode_4BYTE_DATA(to) + to_start
1469 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001470 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001471 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001472 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1473
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001474 if (!check_maxchar) {
1475 if (from_kind == PyUnicode_2BYTE_KIND
1476 && to_kind == PyUnicode_1BYTE_KIND)
1477 {
1478 _PyUnicode_CONVERT_BYTES(
1479 Py_UCS2, Py_UCS1,
1480 PyUnicode_2BYTE_DATA(from) + from_start,
1481 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1482 PyUnicode_1BYTE_DATA(to) + to_start
1483 );
1484 }
1485 else if (from_kind == PyUnicode_4BYTE_KIND
1486 && to_kind == PyUnicode_1BYTE_KIND)
1487 {
1488 _PyUnicode_CONVERT_BYTES(
1489 Py_UCS4, Py_UCS1,
1490 PyUnicode_4BYTE_DATA(from) + from_start,
1491 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1492 PyUnicode_1BYTE_DATA(to) + to_start
1493 );
1494 }
1495 else if (from_kind == PyUnicode_4BYTE_KIND
1496 && to_kind == PyUnicode_2BYTE_KIND)
1497 {
1498 _PyUnicode_CONVERT_BYTES(
1499 Py_UCS4, Py_UCS2,
1500 PyUnicode_4BYTE_DATA(from) + from_start,
1501 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1502 PyUnicode_2BYTE_DATA(to) + to_start
1503 );
1504 }
1505 else {
1506 assert(0);
1507 return -1;
1508 }
1509 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001510 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001511 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001512 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001513 Py_ssize_t i;
1514
Victor Stinnera0702ab2011-09-29 14:14:38 +02001515 for (i=0; i < how_many; i++) {
1516 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001517 if (ch > to_maxchar)
1518 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001519 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1520 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001521 }
1522 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001523 return 0;
1524}
1525
Victor Stinnerd3f08822012-05-29 12:57:52 +02001526void
1527_PyUnicode_FastCopyCharacters(
1528 PyObject *to, Py_ssize_t to_start,
1529 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001530{
1531 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1532}
1533
1534Py_ssize_t
1535PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1536 PyObject *from, Py_ssize_t from_start,
1537 Py_ssize_t how_many)
1538{
1539 int err;
1540
1541 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1542 PyErr_BadInternalCall();
1543 return -1;
1544 }
1545
Benjamin Petersonbac79492012-01-14 13:34:47 -05001546 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001547 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001548 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001549 return -1;
1550
Victor Stinnerd3f08822012-05-29 12:57:52 +02001551 if (from_start < 0) {
1552 PyErr_SetString(PyExc_IndexError, "string index out of range");
1553 return -1;
1554 }
1555 if (to_start < 0) {
1556 PyErr_SetString(PyExc_IndexError, "string index out of range");
1557 return -1;
1558 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001559 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1560 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1561 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001562 "Cannot write %zi characters at %zi "
1563 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001564 how_many, to_start, PyUnicode_GET_LENGTH(to));
1565 return -1;
1566 }
1567
1568 if (how_many == 0)
1569 return 0;
1570
Victor Stinner488fa492011-12-12 00:01:39 +01001571 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001572 return -1;
1573
1574 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1575 if (err) {
1576 PyErr_Format(PyExc_SystemError,
1577 "Cannot copy %s characters "
1578 "into a string of %s characters",
1579 unicode_kind_name(from),
1580 unicode_kind_name(to));
1581 return -1;
1582 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001583 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001584}
1585
Victor Stinner17222162011-09-28 22:15:37 +02001586/* Find the maximum code point and count the number of surrogate pairs so a
1587 correct string length can be computed before converting a string to UCS4.
1588 This function counts single surrogates as a character and not as a pair.
1589
1590 Return 0 on success, or -1 on error. */
1591static int
1592find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1593 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001594{
1595 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001596 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001597
Victor Stinnerc53be962011-10-02 21:33:54 +02001598 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001599 *num_surrogates = 0;
1600 *maxchar = 0;
1601
1602 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001603#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001604 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1605 && (iter+1) < end
1606 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1607 {
1608 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1609 ++(*num_surrogates);
1610 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001611 }
1612 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001613#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001614 {
1615 ch = *iter;
1616 iter++;
1617 }
1618 if (ch > *maxchar) {
1619 *maxchar = ch;
1620 if (*maxchar > MAX_UNICODE) {
1621 PyErr_Format(PyExc_ValueError,
1622 "character U+%x is not in range [U+0000; U+10ffff]",
1623 ch);
1624 return -1;
1625 }
1626 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001627 }
1628 return 0;
1629}
1630
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001631int
1632_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001633{
1634 wchar_t *end;
1635 Py_UCS4 maxchar = 0;
1636 Py_ssize_t num_surrogates;
1637#if SIZEOF_WCHAR_T == 2
1638 Py_ssize_t length_wo_surrogates;
1639#endif
1640
Georg Brandl7597add2011-10-05 16:36:47 +02001641 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001642 strings were created using _PyObject_New() and where no canonical
1643 representation (the str field) has been set yet aka strings
1644 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001645 assert(_PyUnicode_CHECK(unicode));
1646 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001647 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001648 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001649 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001650 /* Actually, it should neither be interned nor be anything else: */
1651 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001652
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001653 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001654 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001655 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001656 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001657
1658 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001659 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1660 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001661 PyErr_NoMemory();
1662 return -1;
1663 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001664 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001665 _PyUnicode_WSTR(unicode), end,
1666 PyUnicode_1BYTE_DATA(unicode));
1667 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1668 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1669 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1670 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001671 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001672 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001673 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001674 }
1675 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001676 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001677 _PyUnicode_UTF8(unicode) = NULL;
1678 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001679 }
1680 PyObject_FREE(_PyUnicode_WSTR(unicode));
1681 _PyUnicode_WSTR(unicode) = NULL;
1682 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1683 }
1684 /* In this case we might have to convert down from 4-byte native
1685 wchar_t to 2-byte unicode. */
1686 else if (maxchar < 65536) {
1687 assert(num_surrogates == 0 &&
1688 "FindMaxCharAndNumSurrogatePairs() messed up");
1689
Victor Stinner506f5922011-09-28 22:34:18 +02001690#if SIZEOF_WCHAR_T == 2
1691 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001692 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001693 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1694 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1695 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001696 _PyUnicode_UTF8(unicode) = NULL;
1697 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001698#else
1699 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001700 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001701 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001702 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001703 PyErr_NoMemory();
1704 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001705 }
Victor Stinner506f5922011-09-28 22:34:18 +02001706 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1707 _PyUnicode_WSTR(unicode), end,
1708 PyUnicode_2BYTE_DATA(unicode));
1709 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1710 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1711 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001712 _PyUnicode_UTF8(unicode) = NULL;
1713 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001714 PyObject_FREE(_PyUnicode_WSTR(unicode));
1715 _PyUnicode_WSTR(unicode) = NULL;
1716 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1717#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001718 }
1719 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1720 else {
1721#if SIZEOF_WCHAR_T == 2
1722 /* in case the native representation is 2-bytes, we need to allocate a
1723 new normalized 4-byte version. */
1724 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001725 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1726 PyErr_NoMemory();
1727 return -1;
1728 }
Victor Stinnerc3c74152011-10-02 20:39:55 +02001729 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1730 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001731 PyErr_NoMemory();
1732 return -1;
1733 }
1734 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1735 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001736 _PyUnicode_UTF8(unicode) = NULL;
1737 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001738 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1739 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001740 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001741 PyObject_FREE(_PyUnicode_WSTR(unicode));
1742 _PyUnicode_WSTR(unicode) = NULL;
1743 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1744#else
1745 assert(num_surrogates == 0);
1746
Victor Stinnerc3c74152011-10-02 20:39:55 +02001747 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001748 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001749 _PyUnicode_UTF8(unicode) = NULL;
1750 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001751 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1752#endif
1753 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1754 }
1755 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001756 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001757 return 0;
1758}
1759
Alexander Belopolsky40018472011-02-26 01:02:56 +00001760static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001761unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001762{
Walter Dörwald16807132007-05-25 13:52:07 +00001763 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001764 case SSTATE_NOT_INTERNED:
1765 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001766
Benjamin Peterson29060642009-01-31 22:14:21 +00001767 case SSTATE_INTERNED_MORTAL:
1768 /* revive dead object temporarily for DelItem */
1769 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001770 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001771 Py_FatalError(
1772 "deletion of interned string failed");
1773 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001774
Benjamin Peterson29060642009-01-31 22:14:21 +00001775 case SSTATE_INTERNED_IMMORTAL:
1776 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001777
Benjamin Peterson29060642009-01-31 22:14:21 +00001778 default:
1779 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001780 }
1781
Victor Stinner03490912011-10-03 23:45:12 +02001782 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001783 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001784 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001785 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001786 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1787 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001788
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001789 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001790}
1791
#ifdef Py_DEBUG
/* Debug helper: return 1 if `unicode` is the shared empty string or one of
   the cached single-character Latin-1 strings, 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;

    /* only a ready string of length 1 can be a cached Latin-1 character */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    if (ch < 256 && unicode_latin1[ch] == unicode)
        return 1;
    return 0;
}
#endif
1808
Alexander Belopolsky40018472011-02-26 01:02:56 +00001809static int
Victor Stinner488fa492011-12-12 00:01:39 +01001810unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001811{
Victor Stinner488fa492011-12-12 00:01:39 +01001812 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001813 if (Py_REFCNT(unicode) != 1)
1814 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001815 if (_PyUnicode_HASH(unicode) != -1)
1816 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001817 if (PyUnicode_CHECK_INTERNED(unicode))
1818 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001819 if (!PyUnicode_CheckExact(unicode))
1820 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001821#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001822 /* singleton refcount is greater than 1 */
1823 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001824#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001825 return 1;
1826}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001827
Victor Stinnerfe226c02011-10-03 03:52:20 +02001828static int
1829unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1830{
1831 PyObject *unicode;
1832 Py_ssize_t old_length;
1833
1834 assert(p_unicode != NULL);
1835 unicode = *p_unicode;
1836
1837 assert(unicode != NULL);
1838 assert(PyUnicode_Check(unicode));
1839 assert(0 <= length);
1840
Victor Stinner910337b2011-10-03 03:20:16 +02001841 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001842 old_length = PyUnicode_WSTR_LENGTH(unicode);
1843 else
1844 old_length = PyUnicode_GET_LENGTH(unicode);
1845 if (old_length == length)
1846 return 0;
1847
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001848 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001849 _Py_INCREF_UNICODE_EMPTY();
1850 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001851 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001852 Py_SETREF(*p_unicode, unicode_empty);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001853 return 0;
1854 }
1855
Victor Stinner488fa492011-12-12 00:01:39 +01001856 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001857 PyObject *copy = resize_copy(unicode, length);
1858 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001859 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001860 Py_SETREF(*p_unicode, copy);
Benjamin Peterson29060642009-01-31 22:14:21 +00001861 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001862 }
1863
Victor Stinnerfe226c02011-10-03 03:52:20 +02001864 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001865 PyObject *new_unicode = resize_compact(unicode, length);
1866 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001867 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001868 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001869 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001870 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001871 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001872}
1873
Alexander Belopolsky40018472011-02-26 01:02:56 +00001874int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001875PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001876{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001877 PyObject *unicode;
1878 if (p_unicode == NULL) {
1879 PyErr_BadInternalCall();
1880 return -1;
1881 }
1882 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001883 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001884 {
1885 PyErr_BadInternalCall();
1886 return -1;
1887 }
1888 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001889}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001890
Serhiy Storchakad65c9492015-11-02 14:10:23 +02001891/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001892
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001893 WARNING: The function doesn't copy the terminating null character and
1894 doesn't check the maximum character (may write a latin1 character in an
1895 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001896static void
1897unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1898 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001899{
1900 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1901 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001902 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001903
1904 switch (kind) {
1905 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001906 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02001907#ifdef Py_DEBUG
1908 if (PyUnicode_IS_ASCII(unicode)) {
1909 Py_UCS4 maxchar = ucs1lib_find_max_char(
1910 (const Py_UCS1*)str,
1911 (const Py_UCS1*)str + len);
1912 assert(maxchar < 128);
1913 }
1914#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001915 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001916 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001917 }
1918 case PyUnicode_2BYTE_KIND: {
1919 Py_UCS2 *start = (Py_UCS2 *)data + index;
1920 Py_UCS2 *ucs2 = start;
1921 assert(index <= PyUnicode_GET_LENGTH(unicode));
1922
Victor Stinner184252a2012-06-16 02:57:41 +02001923 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001924 *ucs2 = (Py_UCS2)*str;
1925
1926 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001927 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001928 }
1929 default: {
1930 Py_UCS4 *start = (Py_UCS4 *)data + index;
1931 Py_UCS4 *ucs4 = start;
1932 assert(kind == PyUnicode_4BYTE_KIND);
1933 assert(index <= PyUnicode_GET_LENGTH(unicode));
1934
Victor Stinner184252a2012-06-16 02:57:41 +02001935 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001936 *ucs4 = (Py_UCS4)*str;
1937
1938 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001939 }
1940 }
1941}
1942
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001943static PyObject*
1944get_latin1_char(unsigned char ch)
1945{
Victor Stinnera464fc12011-10-02 20:39:30 +02001946 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001947 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001948 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001949 if (!unicode)
1950 return NULL;
1951 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001952 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001953 unicode_latin1[ch] = unicode;
1954 }
1955 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001956 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001957}
1958
Victor Stinner985a82a2014-01-03 12:53:47 +01001959static PyObject*
1960unicode_char(Py_UCS4 ch)
1961{
1962 PyObject *unicode;
1963
1964 assert(ch <= MAX_UNICODE);
1965
Victor Stinnerf3b46b42014-01-03 13:16:00 +01001966 if (ch < 256)
1967 return get_latin1_char(ch);
1968
Victor Stinner985a82a2014-01-03 12:53:47 +01001969 unicode = PyUnicode_New(1, ch);
1970 if (unicode == NULL)
1971 return NULL;
1972 switch (PyUnicode_KIND(unicode)) {
1973 case PyUnicode_1BYTE_KIND:
1974 PyUnicode_1BYTE_DATA(unicode)[0] = (Py_UCS1)ch;
1975 break;
1976 case PyUnicode_2BYTE_KIND:
1977 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
1978 break;
1979 default:
1980 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1981 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
1982 }
1983 assert(_PyUnicode_CheckConsistency(unicode, 1));
1984 return unicode;
1985}
1986
Alexander Belopolsky40018472011-02-26 01:02:56 +00001987PyObject *
1988PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001989{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001990 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001991 Py_UCS4 maxchar = 0;
1992 Py_ssize_t num_surrogates;
1993
1994 if (u == NULL)
1995 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001996
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001997 /* If the Unicode data is known at construction time, we can apply
1998 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001999
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002000 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02002001 if (size == 0)
2002 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00002003
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002004 /* Single character Unicode objects in the Latin-1 range are
2005 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002006 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002007 return get_latin1_char((unsigned char)*u);
2008
2009 /* If not empty and not single character, copy the Unicode data
2010 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02002011 if (find_maxchar_surrogates(u, u + size,
2012 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002013 return NULL;
2014
Victor Stinner8faf8212011-12-08 22:14:11 +01002015 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002016 if (!unicode)
2017 return NULL;
2018
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002019 switch (PyUnicode_KIND(unicode)) {
2020 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002021 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002022 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2023 break;
2024 case PyUnicode_2BYTE_KIND:
2025#if Py_UNICODE_SIZE == 2
2026 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
2027#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002028 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002029 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2030#endif
2031 break;
2032 case PyUnicode_4BYTE_KIND:
2033#if SIZEOF_WCHAR_T == 2
2034 /* This is the only case which has to process surrogates, thus
2035 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02002036 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002037#else
2038 assert(num_surrogates == 0);
2039 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
2040#endif
2041 break;
2042 default:
2043 assert(0 && "Impossible state");
2044 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002045
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002046 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002047}
2048
Alexander Belopolsky40018472011-02-26 01:02:56 +00002049PyObject *
2050PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002051{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002052 if (size < 0) {
2053 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00002054 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00002055 return NULL;
2056 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002057 if (u != NULL)
2058 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2059 else
2060 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002061}
2062
Alexander Belopolsky40018472011-02-26 01:02:56 +00002063PyObject *
2064PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00002065{
2066 size_t size = strlen(u);
2067 if (size > PY_SSIZE_T_MAX) {
2068 PyErr_SetString(PyExc_OverflowError, "input too long");
2069 return NULL;
2070 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002071 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002072}
2073
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002074PyObject *
2075_PyUnicode_FromId(_Py_Identifier *id)
2076{
2077 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01002078 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
2079 strlen(id->string),
2080 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002081 if (!id->object)
2082 return NULL;
2083 PyUnicode_InternInPlace(&id->object);
2084 assert(!id->next);
2085 id->next = static_strings;
2086 static_strings = id;
2087 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002088 return id->object;
2089}
2090
2091void
2092_PyUnicode_ClearStaticStrings()
2093{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002094 _Py_Identifier *tmp, *s = static_strings;
2095 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02002096 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002097 tmp = s->next;
2098 s->next = NULL;
2099 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002100 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002101 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002102}
2103
Benjamin Peterson0df54292012-03-26 14:50:32 -04002104/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002105
Victor Stinnerd3f08822012-05-29 12:57:52 +02002106PyObject*
2107_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02002108{
Victor Stinnerd3f08822012-05-29 12:57:52 +02002109 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01002110 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01002111 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02002112#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002113 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02002114#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002115 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01002116 }
Victor Stinner785938e2011-12-11 20:09:03 +01002117 unicode = PyUnicode_New(size, 127);
2118 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02002119 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01002120 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2121 assert(_PyUnicode_CheckConsistency(unicode, 1));
2122 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02002123}
2124
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002125static Py_UCS4
2126kind_maxchar_limit(unsigned int kind)
2127{
Benjamin Petersonead6b532011-12-20 17:23:42 -06002128 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002129 case PyUnicode_1BYTE_KIND:
2130 return 0x80;
2131 case PyUnicode_2BYTE_KIND:
2132 return 0x100;
2133 case PyUnicode_4BYTE_KIND:
2134 return 0x10000;
2135 default:
2136 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01002137 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002138 }
2139}
2140
Victor Stinnere6abb482012-05-02 01:15:40 +02002141Py_LOCAL_INLINE(Py_UCS4)
2142align_maxchar(Py_UCS4 maxchar)
2143{
2144 if (maxchar <= 127)
2145 return 127;
2146 else if (maxchar <= 255)
2147 return 255;
2148 else if (maxchar <= 65535)
2149 return 65535;
2150 else
2151 return MAX_UNICODE;
2152}
2153
Victor Stinner702c7342011-10-05 13:50:52 +02002154static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002155_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00002156{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002157 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002158 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002159
Serhiy Storchaka678db842013-01-26 12:16:36 +02002160 if (size == 0)
2161 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002162 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002163 if (size == 1)
2164 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002165
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002166 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002167 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002168 if (!res)
2169 return NULL;
2170 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002171 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002172 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00002173}
2174
Victor Stinnere57b1c02011-09-28 22:20:48 +02002175static PyObject*
2176_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002177{
2178 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002179 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002180
Serhiy Storchaka678db842013-01-26 12:16:36 +02002181 if (size == 0)
2182 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002183 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002184 if (size == 1)
2185 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002186
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002187 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002188 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002189 if (!res)
2190 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002191 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002192 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002193 else {
2194 _PyUnicode_CONVERT_BYTES(
2195 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2196 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002197 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002198 return res;
2199}
2200
Victor Stinnere57b1c02011-09-28 22:20:48 +02002201static PyObject*
2202_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002203{
2204 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002205 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002206
Serhiy Storchaka678db842013-01-26 12:16:36 +02002207 if (size == 0)
2208 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002209 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002210 if (size == 1)
2211 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002212
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002213 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002214 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002215 if (!res)
2216 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002217 if (max_char < 256)
2218 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2219 PyUnicode_1BYTE_DATA(res));
2220 else if (max_char < 0x10000)
2221 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2222 PyUnicode_2BYTE_DATA(res));
2223 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002224 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002225 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002226 return res;
2227}
2228
2229PyObject*
2230PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2231{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002232 if (size < 0) {
2233 PyErr_SetString(PyExc_ValueError, "size must be positive");
2234 return NULL;
2235 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002236 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002237 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002238 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002239 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002240 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002241 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002242 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002243 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002244 PyErr_SetString(PyExc_SystemError, "invalid kind");
2245 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002246 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002247}
2248
Victor Stinnerece58de2012-04-23 23:36:38 +02002249Py_UCS4
2250_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2251{
2252 enum PyUnicode_Kind kind;
2253 void *startptr, *endptr;
2254
2255 assert(PyUnicode_IS_READY(unicode));
2256 assert(0 <= start);
2257 assert(end <= PyUnicode_GET_LENGTH(unicode));
2258 assert(start <= end);
2259
2260 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2261 return PyUnicode_MAX_CHAR_VALUE(unicode);
2262
2263 if (start == end)
2264 return 127;
2265
Victor Stinner94d558b2012-04-27 22:26:58 +02002266 if (PyUnicode_IS_ASCII(unicode))
2267 return 127;
2268
Victor Stinnerece58de2012-04-23 23:36:38 +02002269 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002270 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002271 endptr = (char *)startptr + end * kind;
2272 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002273 switch(kind) {
2274 case PyUnicode_1BYTE_KIND:
2275 return ucs1lib_find_max_char(startptr, endptr);
2276 case PyUnicode_2BYTE_KIND:
2277 return ucs2lib_find_max_char(startptr, endptr);
2278 case PyUnicode_4BYTE_KIND:
2279 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002280 default:
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002281 assert(0);
2282 return 0;
Victor Stinnerece58de2012-04-23 23:36:38 +02002283 }
2284}
2285
Victor Stinner25a4b292011-10-06 12:31:55 +02002286/* Ensure that a string uses the most efficient storage, if it is not the
2287 case: create a new string with of the right kind. Write NULL into *p_unicode
2288 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002289static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002290unicode_adjust_maxchar(PyObject **p_unicode)
2291{
2292 PyObject *unicode, *copy;
2293 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002294 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002295 unsigned int kind;
2296
2297 assert(p_unicode != NULL);
2298 unicode = *p_unicode;
2299 assert(PyUnicode_IS_READY(unicode));
2300 if (PyUnicode_IS_ASCII(unicode))
2301 return;
2302
2303 len = PyUnicode_GET_LENGTH(unicode);
2304 kind = PyUnicode_KIND(unicode);
2305 if (kind == PyUnicode_1BYTE_KIND) {
2306 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002307 max_char = ucs1lib_find_max_char(u, u + len);
2308 if (max_char >= 128)
2309 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002310 }
2311 else if (kind == PyUnicode_2BYTE_KIND) {
2312 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002313 max_char = ucs2lib_find_max_char(u, u + len);
2314 if (max_char >= 256)
2315 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002316 }
2317 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002318 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002319 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002320 max_char = ucs4lib_find_max_char(u, u + len);
2321 if (max_char >= 0x10000)
2322 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002323 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002324 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002325 if (copy != NULL)
2326 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002327 Py_DECREF(unicode);
2328 *p_unicode = copy;
2329}
2330
Victor Stinner034f6cf2011-09-30 02:26:44 +02002331PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002332_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002333{
Victor Stinner87af4f22011-11-21 23:03:47 +01002334 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002335 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002336
Victor Stinner034f6cf2011-09-30 02:26:44 +02002337 if (!PyUnicode_Check(unicode)) {
2338 PyErr_BadInternalCall();
2339 return NULL;
2340 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002341 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002342 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002343
Victor Stinner87af4f22011-11-21 23:03:47 +01002344 length = PyUnicode_GET_LENGTH(unicode);
2345 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002346 if (!copy)
2347 return NULL;
2348 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2349
Victor Stinner87af4f22011-11-21 23:03:47 +01002350 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2351 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002352 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002353 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002354}
2355
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002356
Victor Stinnerbc603d12011-10-02 01:00:40 +02002357/* Widen Unicode objects to larger buffers. Don't write terminating null
2358 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002359
2360void*
2361_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2362{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002363 Py_ssize_t len;
2364 void *result;
2365 unsigned int skind;
2366
Benjamin Petersonbac79492012-01-14 13:34:47 -05002367 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002368 return NULL;
2369
2370 len = PyUnicode_GET_LENGTH(s);
2371 skind = PyUnicode_KIND(s);
2372 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002373 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002374 return NULL;
2375 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002376 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002377 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002378 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002379 if (!result)
2380 return PyErr_NoMemory();
2381 assert(skind == PyUnicode_1BYTE_KIND);
2382 _PyUnicode_CONVERT_BYTES(
2383 Py_UCS1, Py_UCS2,
2384 PyUnicode_1BYTE_DATA(s),
2385 PyUnicode_1BYTE_DATA(s) + len,
2386 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002387 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002388 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002389 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002390 if (!result)
2391 return PyErr_NoMemory();
2392 if (skind == PyUnicode_2BYTE_KIND) {
2393 _PyUnicode_CONVERT_BYTES(
2394 Py_UCS2, Py_UCS4,
2395 PyUnicode_2BYTE_DATA(s),
2396 PyUnicode_2BYTE_DATA(s) + len,
2397 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002398 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002399 else {
2400 assert(skind == PyUnicode_1BYTE_KIND);
2401 _PyUnicode_CONVERT_BYTES(
2402 Py_UCS1, Py_UCS4,
2403 PyUnicode_1BYTE_DATA(s),
2404 PyUnicode_1BYTE_DATA(s) + len,
2405 result);
2406 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002407 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002408 default:
2409 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002410 }
Victor Stinner01698042011-10-04 00:04:26 +02002411 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002412 return NULL;
2413}
2414
2415static Py_UCS4*
2416as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2417 int copy_null)
2418{
2419 int kind;
2420 void *data;
2421 Py_ssize_t len, targetlen;
2422 if (PyUnicode_READY(string) == -1)
2423 return NULL;
2424 kind = PyUnicode_KIND(string);
2425 data = PyUnicode_DATA(string);
2426 len = PyUnicode_GET_LENGTH(string);
2427 targetlen = len;
2428 if (copy_null)
2429 targetlen++;
2430 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002431 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002432 if (!target) {
2433 PyErr_NoMemory();
2434 return NULL;
2435 }
2436 }
2437 else {
2438 if (targetsize < targetlen) {
2439 PyErr_Format(PyExc_SystemError,
2440 "string is longer than the buffer");
2441 if (copy_null && 0 < targetsize)
2442 target[0] = 0;
2443 return NULL;
2444 }
2445 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002446 if (kind == PyUnicode_1BYTE_KIND) {
2447 Py_UCS1 *start = (Py_UCS1 *) data;
2448 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002449 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002450 else if (kind == PyUnicode_2BYTE_KIND) {
2451 Py_UCS2 *start = (Py_UCS2 *) data;
2452 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2453 }
2454 else {
2455 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002456 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002457 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002458 if (copy_null)
2459 target[len] = 0;
2460 return target;
2461}
2462
2463Py_UCS4*
2464PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2465 int copy_null)
2466{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002467 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002468 PyErr_BadInternalCall();
2469 return NULL;
2470 }
2471 return as_ucs4(string, target, targetsize, copy_null);
2472}
2473
2474Py_UCS4*
2475PyUnicode_AsUCS4Copy(PyObject *string)
2476{
2477 return as_ucs4(string, NULL, 0, 1);
2478}
2479
#ifdef HAVE_WCHAR_H

/* Create a Unicode object from the wchar_t buffer `w` of `size` units.

   A `size` of -1 means "measure the buffer with wcslen()".  A NULL `w`
   is accepted only together with size == 0, which yields the empty
   string; any other size with a NULL buffer is a bad internal call and
   returns NULL.  The conversion itself is done by
   PyUnicode_FromUnicode(). */
PyObject *
PyUnicode_FromWideChar(const wchar_t *w, Py_ssize_t size)
{
    if (w != NULL) {
        Py_ssize_t nunits = (size == -1) ? (Py_ssize_t)wcslen(w) : size;

        return PyUnicode_FromUnicode(w, nunits);
    }

    if (size == 0) {
        _Py_RETURN_UNICODE_EMPTY();
    }
    PyErr_BadInternalCall();
    return NULL;
}

#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002500
/* Maximum number of characters required for the output of %lld or %p.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign.  53/22 is an upper bound for log10(256). */
2504#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002505
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002506static int
2507unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2508 Py_ssize_t width, Py_ssize_t precision)
2509{
2510 Py_ssize_t length, fill, arglen;
2511 Py_UCS4 maxchar;
2512
2513 if (PyUnicode_READY(str) == -1)
2514 return -1;
2515
2516 length = PyUnicode_GET_LENGTH(str);
2517 if ((precision == -1 || precision >= length)
2518 && width <= length)
2519 return _PyUnicodeWriter_WriteStr(writer, str);
2520
2521 if (precision != -1)
2522 length = Py_MIN(precision, length);
2523
2524 arglen = Py_MAX(length, width);
2525 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2526 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2527 else
2528 maxchar = writer->maxchar;
2529
2530 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2531 return -1;
2532
2533 if (width > length) {
2534 fill = width - length;
2535 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2536 return -1;
2537 writer->pos += fill;
2538 }
2539
2540 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2541 str, 0, length);
2542 writer->pos += length;
2543 return 0;
2544}
2545
2546static int
2547unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2548 Py_ssize_t width, Py_ssize_t precision)
2549{
2550 /* UTF-8 */
2551 Py_ssize_t length;
2552 PyObject *unicode;
2553 int res;
2554
2555 length = strlen(str);
2556 if (precision != -1)
2557 length = Py_MIN(length, precision);
2558 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2559 if (unicode == NULL)
2560 return -1;
2561
2562 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2563 Py_DECREF(unicode);
2564 return res;
2565}
2566
Victor Stinner96865452011-03-01 23:44:09 +00002567static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002568unicode_fromformat_arg(_PyUnicodeWriter *writer,
2569 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002570{
Victor Stinnere215d962012-10-06 23:03:36 +02002571 const char *p;
2572 Py_ssize_t len;
2573 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002574 Py_ssize_t width;
2575 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002576 int longflag;
2577 int longlongflag;
2578 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002579 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002580
2581 p = f;
2582 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002583 zeropad = 0;
2584 if (*f == '0') {
2585 zeropad = 1;
2586 f++;
2587 }
Victor Stinner96865452011-03-01 23:44:09 +00002588
2589 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002590 width = -1;
2591 if (Py_ISDIGIT((unsigned)*f)) {
2592 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002593 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002594 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002595 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002596 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002597 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002598 return NULL;
2599 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002600 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002601 f++;
2602 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002603 }
2604 precision = -1;
2605 if (*f == '.') {
2606 f++;
2607 if (Py_ISDIGIT((unsigned)*f)) {
2608 precision = (*f - '0');
2609 f++;
2610 while (Py_ISDIGIT((unsigned)*f)) {
2611 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2612 PyErr_SetString(PyExc_ValueError,
2613 "precision too big");
2614 return NULL;
2615 }
2616 precision = (precision * 10) + (*f - '0');
2617 f++;
2618 }
2619 }
Victor Stinner96865452011-03-01 23:44:09 +00002620 if (*f == '%') {
2621 /* "%.3%s" => f points to "3" */
2622 f--;
2623 }
2624 }
2625 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002626 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002627 f--;
2628 }
Victor Stinner96865452011-03-01 23:44:09 +00002629
2630 /* Handle %ld, %lu, %lld and %llu. */
2631 longflag = 0;
2632 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002633 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002634 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002635 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002636 longflag = 1;
2637 ++f;
2638 }
Victor Stinner96865452011-03-01 23:44:09 +00002639 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002640 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002641 longlongflag = 1;
2642 f += 2;
2643 }
Victor Stinner96865452011-03-01 23:44:09 +00002644 }
2645 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002646 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002647 size_tflag = 1;
2648 ++f;
2649 }
Victor Stinnere215d962012-10-06 23:03:36 +02002650
2651 if (f[1] == '\0')
2652 writer->overallocate = 0;
2653
2654 switch (*f) {
2655 case 'c':
2656 {
2657 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002658 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002659 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002660 "character argument not in range(0x110000)");
2661 return NULL;
2662 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002663 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002664 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002665 break;
2666 }
2667
2668 case 'i':
2669 case 'd':
2670 case 'u':
2671 case 'x':
2672 {
2673 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002674 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002675 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002676
2677 if (*f == 'u') {
Victor Stinnere215d962012-10-06 23:03:36 +02002678 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002679 len = sprintf(buffer, "%lu",
Victor Stinnere215d962012-10-06 23:03:36 +02002680 va_arg(*vargs, unsigned long));
Victor Stinnere215d962012-10-06 23:03:36 +02002681 else if (longlongflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002682 len = sprintf(buffer, "%" PY_FORMAT_LONG_LONG "u",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002683 va_arg(*vargs, unsigned long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002684 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002685 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002686 va_arg(*vargs, size_t));
2687 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002688 len = sprintf(buffer, "%u",
Victor Stinnere215d962012-10-06 23:03:36 +02002689 va_arg(*vargs, unsigned int));
2690 }
2691 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002692 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002693 }
2694 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002695 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002696 len = sprintf(buffer, "%li",
Victor Stinnere215d962012-10-06 23:03:36 +02002697 va_arg(*vargs, long));
Victor Stinnere215d962012-10-06 23:03:36 +02002698 else if (longlongflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002699 len = sprintf(buffer, "%" PY_FORMAT_LONG_LONG "i",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002700 va_arg(*vargs, long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002701 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002702 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002703 va_arg(*vargs, Py_ssize_t));
2704 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002705 len = sprintf(buffer, "%i",
Victor Stinnere215d962012-10-06 23:03:36 +02002706 va_arg(*vargs, int));
2707 }
2708 assert(len >= 0);
2709
Victor Stinnere215d962012-10-06 23:03:36 +02002710 if (precision < len)
2711 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002712
2713 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002714 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2715 return NULL;
2716
Victor Stinnere215d962012-10-06 23:03:36 +02002717 if (width > precision) {
2718 Py_UCS4 fillchar;
2719 fill = width - precision;
2720 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002721 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2722 return NULL;
2723 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002724 }
Victor Stinner15a11362012-10-06 23:48:20 +02002725 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002726 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002727 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2728 return NULL;
2729 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002730 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002731
Victor Stinner4a587072013-11-19 12:54:53 +01002732 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2733 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002734 break;
2735 }
2736
2737 case 'p':
2738 {
2739 char number[MAX_LONG_LONG_CHARS];
2740
2741 len = sprintf(number, "%p", va_arg(*vargs, void*));
2742 assert(len >= 0);
2743
2744 /* %p is ill-defined: ensure leading 0x. */
2745 if (number[1] == 'X')
2746 number[1] = 'x';
2747 else if (number[1] != 'x') {
2748 memmove(number + 2, number,
2749 strlen(number) + 1);
2750 number[0] = '0';
2751 number[1] = 'x';
2752 len += 2;
2753 }
2754
Victor Stinner4a587072013-11-19 12:54:53 +01002755 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002756 return NULL;
2757 break;
2758 }
2759
2760 case 's':
2761 {
2762 /* UTF-8 */
2763 const char *s = va_arg(*vargs, const char*);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002764 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002765 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002766 break;
2767 }
2768
2769 case 'U':
2770 {
2771 PyObject *obj = va_arg(*vargs, PyObject *);
2772 assert(obj && _PyUnicode_CHECK(obj));
2773
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002774 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002775 return NULL;
2776 break;
2777 }
2778
2779 case 'V':
2780 {
2781 PyObject *obj = va_arg(*vargs, PyObject *);
2782 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002783 if (obj) {
2784 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002785 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002786 return NULL;
2787 }
2788 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002789 assert(str != NULL);
2790 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002791 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002792 }
2793 break;
2794 }
2795
2796 case 'S':
2797 {
2798 PyObject *obj = va_arg(*vargs, PyObject *);
2799 PyObject *str;
2800 assert(obj);
2801 str = PyObject_Str(obj);
2802 if (!str)
2803 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002804 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002805 Py_DECREF(str);
2806 return NULL;
2807 }
2808 Py_DECREF(str);
2809 break;
2810 }
2811
2812 case 'R':
2813 {
2814 PyObject *obj = va_arg(*vargs, PyObject *);
2815 PyObject *repr;
2816 assert(obj);
2817 repr = PyObject_Repr(obj);
2818 if (!repr)
2819 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002820 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002821 Py_DECREF(repr);
2822 return NULL;
2823 }
2824 Py_DECREF(repr);
2825 break;
2826 }
2827
2828 case 'A':
2829 {
2830 PyObject *obj = va_arg(*vargs, PyObject *);
2831 PyObject *ascii;
2832 assert(obj);
2833 ascii = PyObject_ASCII(obj);
2834 if (!ascii)
2835 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002836 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002837 Py_DECREF(ascii);
2838 return NULL;
2839 }
2840 Py_DECREF(ascii);
2841 break;
2842 }
2843
2844 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002845 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002846 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002847 break;
2848
2849 default:
2850 /* if we stumble upon an unknown formatting code, copy the rest
2851 of the format string to the output string. (we cannot just
2852 skip the code, since there's no way to know what's in the
2853 argument list) */
2854 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01002855 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002856 return NULL;
2857 f = p+len;
2858 return f;
2859 }
2860
2861 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002862 return f;
2863}
2864
Walter Dörwaldd2034312007-05-18 16:29:38 +00002865PyObject *
2866PyUnicode_FromFormatV(const char *format, va_list vargs)
2867{
Victor Stinnere215d962012-10-06 23:03:36 +02002868 va_list vargs2;
2869 const char *f;
2870 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002871
Victor Stinner8f674cc2013-04-17 23:02:17 +02002872 _PyUnicodeWriter_Init(&writer);
2873 writer.min_length = strlen(format) + 100;
2874 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02002875
2876 /* va_list may be an array (of 1 item) on some platforms (ex: AMD64).
2877 Copy it to be able to pass a reference to a subfunction. */
2878 Py_VA_COPY(vargs2, vargs);
2879
2880 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002881 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002882 f = unicode_fromformat_arg(&writer, f, &vargs2);
2883 if (f == NULL)
2884 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002885 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002886 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002887 const char *p;
2888 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002889
Victor Stinnere215d962012-10-06 23:03:36 +02002890 p = f;
2891 do
2892 {
2893 if ((unsigned char)*p > 127) {
2894 PyErr_Format(PyExc_ValueError,
2895 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2896 "string, got a non-ASCII byte: 0x%02x",
2897 (unsigned char)*p);
2898 return NULL;
2899 }
2900 p++;
2901 }
2902 while (*p != '\0' && *p != '%');
2903 len = p - f;
2904
2905 if (*p == '\0')
2906 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01002907
2908 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002909 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002910
2911 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002912 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002913 }
Victor Stinnere215d962012-10-06 23:03:36 +02002914 return _PyUnicodeWriter_Finish(&writer);
2915
2916 fail:
2917 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002918 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002919}
2920
Walter Dörwaldd2034312007-05-18 16:29:38 +00002921PyObject *
2922PyUnicode_FromFormat(const char *format, ...)
2923{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002924 PyObject* ret;
2925 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002926
2927#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002928 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002929#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002930 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002931#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002932 ret = PyUnicode_FromFormatV(format, vargs);
2933 va_end(vargs);
2934 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002935}
2936
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002937#ifdef HAVE_WCHAR_H
2938
Victor Stinner5593d8a2010-10-02 11:11:27 +00002939/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2940 convert a Unicode object to a wide character string.
2941
Victor Stinnerd88d9832011-09-06 02:00:05 +02002942 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002943 character) required to convert the unicode object. Ignore size argument.
2944
Victor Stinnerd88d9832011-09-06 02:00:05 +02002945 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002946 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002947 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002948static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002949unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002950 wchar_t *w,
2951 Py_ssize_t size)
2952{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002953 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002954 const wchar_t *wstr;
2955
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002956 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002957 if (wstr == NULL)
2958 return -1;
2959
Victor Stinner5593d8a2010-10-02 11:11:27 +00002960 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002961 if (size > res)
2962 size = res + 1;
2963 else
2964 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002965 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002966 return res;
2967 }
2968 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002969 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002970}
2971
2972Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002973PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002974 wchar_t *w,
2975 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002976{
2977 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002978 PyErr_BadInternalCall();
2979 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002980 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002981 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002982}
2983
Victor Stinner137c34c2010-09-29 10:25:54 +00002984wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002985PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002986 Py_ssize_t *size)
2987{
2988 wchar_t* buffer;
2989 Py_ssize_t buflen;
2990
2991 if (unicode == NULL) {
2992 PyErr_BadInternalCall();
2993 return NULL;
2994 }
2995
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002996 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002997 if (buflen == -1)
2998 return NULL;
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002999 buffer = PyMem_NEW(wchar_t, buflen);
Victor Stinner137c34c2010-09-29 10:25:54 +00003000 if (buffer == NULL) {
3001 PyErr_NoMemory();
3002 return NULL;
3003 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003004 buflen = unicode_aswidechar(unicode, buffer, buflen);
Stefan Krah8528c312012-08-19 21:52:43 +02003005 if (buflen == -1) {
3006 PyMem_FREE(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003007 return NULL;
Stefan Krah8528c312012-08-19 21:52:43 +02003008 }
Victor Stinner5593d8a2010-10-02 11:11:27 +00003009 if (size != NULL)
3010 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00003011 return buffer;
3012}
3013
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003014#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003015
Alexander Belopolsky40018472011-02-26 01:02:56 +00003016PyObject *
3017PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003018{
Victor Stinner8faf8212011-12-08 22:14:11 +01003019 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003020 PyErr_SetString(PyExc_ValueError,
3021 "chr() arg not in range(0x110000)");
3022 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003023 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00003024
Victor Stinner985a82a2014-01-03 12:53:47 +01003025 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003026}
3027
Alexander Belopolsky40018472011-02-26 01:02:56 +00003028PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003029PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003030{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003031 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00003032 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003033 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003034 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02003035 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00003036 Py_INCREF(obj);
3037 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003038 }
3039 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003040 /* For a Unicode subtype that's not a Unicode object,
3041 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003042 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003043 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003044 PyErr_Format(PyExc_TypeError,
3045 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00003046 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003047 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003048}
3049
Alexander Belopolsky40018472011-02-26 01:02:56 +00003050PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003051PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003052 const char *encoding,
3053 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003054{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003055 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003056 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003057
Guido van Rossumd57fd912000-03-10 22:53:23 +00003058 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003059 PyErr_BadInternalCall();
3060 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003061 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003062
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003063 /* Decoding bytes objects is the most common case and should be fast */
3064 if (PyBytes_Check(obj)) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003065 if (PyBytes_GET_SIZE(obj) == 0)
3066 _Py_RETURN_UNICODE_EMPTY();
3067 v = PyUnicode_Decode(
3068 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3069 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003070 return v;
3071 }
3072
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003073 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003074 PyErr_SetString(PyExc_TypeError,
3075 "decoding str is not supported");
3076 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003077 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003078
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003079 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3080 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3081 PyErr_Format(PyExc_TypeError,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003082 "decoding to str: need a bytes-like object, %.80s found",
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003083 Py_TYPE(obj)->tp_name);
3084 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003085 }
Tim Petersced69f82003-09-16 20:30:58 +00003086
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003087 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003088 PyBuffer_Release(&buffer);
3089 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003090 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003091
Serhiy Storchaka05997252013-01-26 12:14:02 +02003092 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003093 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003094 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003095}
3096
/* Normalize an encoding name: C implementation of
   encodings.normalize_encoding().

   Alphanumeric characters and '.' are copied in lower case.  Every run of
   other characters is replaced by a single '_', except at the very start
   (dropped) and at the very end (dropped).  The result is written to lower,
   which holds lower_len bytes including the terminating null byte.

   Return 1 on success, or 0 on error (the normalized name is longer than
   lower_len-1). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *src;
    char *dst;
    char *dst_end;
    int pending_sep;

    assert(encoding != NULL);

    dst = lower;
    dst_end = &lower[lower_len - 1];   /* slot reserved for the final '\0' */
    pending_sep = 0;

    for (src = encoding; *src != 0; src++) {
        char c = *src;

        if (!Py_ISALNUM(c) && c != '.') {
            /* Punctuation: remember it, emit '_' only if more text follows. */
            pending_sep = 1;
            continue;
        }

        if (pending_sep && dst != lower) {
            if (dst == dst_end) {
                return 0;
            }
            *dst++ = '_';
        }
        pending_sep = 0;

        if (dst == dst_end) {
            return 0;
        }
        *dst++ = Py_TOLOWER(c);
    }

    *dst = '\0';
    return 1;
}
3145
Alexander Belopolsky40018472011-02-26 01:02:56 +00003146PyObject *
3147PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003148 Py_ssize_t size,
3149 const char *encoding,
3150 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003151{
3152 PyObject *buffer = NULL, *unicode;
3153 Py_buffer info;
Victor Stinner942889a2016-09-05 15:40:10 -07003154 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3155
3156 if (encoding == NULL) {
3157 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3158 }
Victor Stinner600d3be2010-06-10 12:00:55 +00003159
Fred Drakee4315f52000-05-09 19:53:39 +00003160 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003161 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3162 char *lower = buflower;
3163
3164 /* Fast paths */
3165 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3166 lower += 3;
3167 if (*lower == '_') {
3168 /* Match "utf8" and "utf_8" */
3169 lower++;
3170 }
3171
3172 if (lower[0] == '8' && lower[1] == 0) {
3173 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3174 }
3175 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3176 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3177 }
3178 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3179 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3180 }
3181 }
3182 else {
3183 if (strcmp(lower, "ascii") == 0
3184 || strcmp(lower, "us_ascii") == 0) {
3185 return PyUnicode_DecodeASCII(s, size, errors);
3186 }
3187 #ifdef HAVE_MBCS
3188 else if (strcmp(lower, "mbcs") == 0) {
3189 return PyUnicode_DecodeMBCS(s, size, errors);
3190 }
3191 #endif
3192 else if (strcmp(lower, "latin1") == 0
3193 || strcmp(lower, "latin_1") == 0
3194 || strcmp(lower, "iso_8859_1") == 0
3195 || strcmp(lower, "iso8859_1") == 0) {
3196 return PyUnicode_DecodeLatin1(s, size, errors);
3197 }
3198 }
Victor Stinner37296e82010-06-10 13:36:23 +00003199 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003200
3201 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003202 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003203 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003204 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003205 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003206 if (buffer == NULL)
3207 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003208 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003209 if (unicode == NULL)
3210 goto onError;
3211 if (!PyUnicode_Check(unicode)) {
3212 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003213 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3214 "use codecs.decode() to decode to arbitrary types",
3215 encoding,
3216 Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003217 Py_DECREF(unicode);
3218 goto onError;
3219 }
3220 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003221 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003222
Benjamin Peterson29060642009-01-31 22:14:21 +00003223 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003224 Py_XDECREF(buffer);
3225 return NULL;
3226}
3227
Alexander Belopolsky40018472011-02-26 01:02:56 +00003228PyObject *
3229PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003230 const char *encoding,
3231 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003232{
3233 PyObject *v;
3234
3235 if (!PyUnicode_Check(unicode)) {
3236 PyErr_BadArgument();
3237 goto onError;
3238 }
3239
3240 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003241 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003242
3243 /* Decode via the codec registry */
3244 v = PyCodec_Decode(unicode, encoding, errors);
3245 if (v == NULL)
3246 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003247 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003248
Benjamin Peterson29060642009-01-31 22:14:21 +00003249 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003250 return NULL;
3251}
3252
Alexander Belopolsky40018472011-02-26 01:02:56 +00003253PyObject *
3254PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003255 const char *encoding,
3256 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003257{
3258 PyObject *v;
3259
3260 if (!PyUnicode_Check(unicode)) {
3261 PyErr_BadArgument();
3262 goto onError;
3263 }
3264
3265 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003266 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003267
3268 /* Decode via the codec registry */
3269 v = PyCodec_Decode(unicode, encoding, errors);
3270 if (v == NULL)
3271 goto onError;
3272 if (!PyUnicode_Check(v)) {
3273 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003274 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3275 "use codecs.decode() to decode to arbitrary types",
3276 encoding,
3277 Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003278 Py_DECREF(v);
3279 goto onError;
3280 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003281 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003282
Benjamin Peterson29060642009-01-31 22:14:21 +00003283 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003284 return NULL;
3285}
3286
Alexander Belopolsky40018472011-02-26 01:02:56 +00003287PyObject *
3288PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003289 Py_ssize_t size,
3290 const char *encoding,
3291 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003292{
3293 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003294
Guido van Rossumd57fd912000-03-10 22:53:23 +00003295 unicode = PyUnicode_FromUnicode(s, size);
3296 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003297 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003298 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3299 Py_DECREF(unicode);
3300 return v;
3301}
3302
Alexander Belopolsky40018472011-02-26 01:02:56 +00003303PyObject *
3304PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003305 const char *encoding,
3306 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003307{
3308 PyObject *v;
3309
3310 if (!PyUnicode_Check(unicode)) {
3311 PyErr_BadArgument();
3312 goto onError;
3313 }
3314
3315 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003316 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003317
3318 /* Encode via the codec registry */
3319 v = PyCodec_Encode(unicode, encoding, errors);
3320 if (v == NULL)
3321 goto onError;
3322 return v;
3323
Benjamin Peterson29060642009-01-31 22:14:21 +00003324 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003325 return NULL;
3326}
3327
/* Locate the first wide character of wstr that wcstombs() refuses to
   convert.

   The string is fed to wcstombs() one character at a time (one surrogate
   pair at a time when wchar_t is 16 bits wide).  The return value is the
   index, in wchar_t units, of the first piece that fails to convert, or 0
   when every piece converts. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
#if SIZEOF_WCHAR_T == 2
    wchar_t unit[3];
#else
    wchar_t unit[2];
#endif
    char scratch[MB_LEN_MAX];
    const wchar_t *cursor = wstr;

    /* The last slot always holds the terminator of the one-piece string. */
#if SIZEOF_WCHAR_T == 2
    unit[2] = 0;
#else
    unit[1] = 0;
#endif

    while (*cursor != L'\0') {
        const wchar_t *mark = cursor;

#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cursor[1]))
        {
            unit[0] = cursor[0];
            unit[1] = cursor[1];
            cursor += 2;
        }
        else {
            unit[0] = *cursor;
            unit[1] = 0;
            cursor++;
        }
#else
        unit[0] = *cursor;
        cursor++;
#endif
        if (wcstombs(scratch, unit, sizeof(scratch)) == (size_t)-1) {
            return mark - wstr;
        }
    }

    /* failed to find the unencodable character */
    return 0;
}
3374
Victor Stinner1b579672011-12-17 05:47:23 +01003375static int
3376locale_error_handler(const char *errors, int *surrogateescape)
3377{
Victor Stinner50149202015-09-22 00:26:54 +02003378 _Py_error_handler error_handler = get_error_handler(errors);
3379 switch (error_handler)
3380 {
3381 case _Py_ERROR_STRICT:
Victor Stinner1b579672011-12-17 05:47:23 +01003382 *surrogateescape = 0;
3383 return 0;
Victor Stinner50149202015-09-22 00:26:54 +02003384 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner1b579672011-12-17 05:47:23 +01003385 *surrogateescape = 1;
3386 return 0;
Victor Stinner50149202015-09-22 00:26:54 +02003387 default:
3388 PyErr_Format(PyExc_ValueError,
3389 "only 'strict' and 'surrogateescape' error handlers "
3390 "are supported, not '%s'",
3391 errors);
3392 return -1;
Victor Stinner1b579672011-12-17 05:47:23 +01003393 }
Victor Stinner1b579672011-12-17 05:47:23 +01003394}
3395
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003396PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003397PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003398{
3399 Py_ssize_t wlen, wlen2;
3400 wchar_t *wstr;
3401 PyObject *bytes = NULL;
3402 char *errmsg;
Raymond Hettingere56666d2013-08-04 11:51:03 -07003403 PyObject *reason = NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003404 PyObject *exc;
3405 size_t error_pos;
Victor Stinner1b579672011-12-17 05:47:23 +01003406 int surrogateescape;
3407
3408 if (locale_error_handler(errors, &surrogateescape) < 0)
3409 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003410
3411 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3412 if (wstr == NULL)
3413 return NULL;
3414
3415 wlen2 = wcslen(wstr);
3416 if (wlen2 != wlen) {
3417 PyMem_Free(wstr);
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003418 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003419 return NULL;
3420 }
3421
3422 if (surrogateescape) {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003423 /* "surrogateescape" error handler */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003424 char *str;
3425
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003426 str = Py_EncodeLocale(wstr, &error_pos);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003427 if (str == NULL) {
3428 if (error_pos == (size_t)-1) {
3429 PyErr_NoMemory();
3430 PyMem_Free(wstr);
3431 return NULL;
3432 }
3433 else {
3434 goto encode_error;
3435 }
3436 }
3437 PyMem_Free(wstr);
3438
3439 bytes = PyBytes_FromString(str);
3440 PyMem_Free(str);
3441 }
3442 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003443 /* strict mode */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003444 size_t len, len2;
3445
3446 len = wcstombs(NULL, wstr, 0);
3447 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003448 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003449 goto encode_error;
3450 }
3451
3452 bytes = PyBytes_FromStringAndSize(NULL, len);
3453 if (bytes == NULL) {
3454 PyMem_Free(wstr);
3455 return NULL;
3456 }
3457
3458 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3459 if (len2 == (size_t)-1 || len2 > len) {
Victor Stinner2f197072011-12-17 07:08:30 +01003460 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003461 goto encode_error;
3462 }
3463 PyMem_Free(wstr);
3464 }
3465 return bytes;
3466
3467encode_error:
3468 errmsg = strerror(errno);
3469 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003470
3471 if (error_pos == (size_t)-1)
3472 error_pos = wcstombs_errorpos(wstr);
3473
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003474 PyMem_Free(wstr);
3475 Py_XDECREF(bytes);
3476
Victor Stinner2f197072011-12-17 07:08:30 +01003477 if (errmsg != NULL) {
3478 size_t errlen;
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003479 wstr = Py_DecodeLocale(errmsg, &errlen);
Victor Stinner2f197072011-12-17 07:08:30 +01003480 if (wstr != NULL) {
3481 reason = PyUnicode_FromWideChar(wstr, errlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003482 PyMem_RawFree(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003483 } else
3484 errmsg = NULL;
3485 }
3486 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003487 reason = PyUnicode_FromString(
3488 "wcstombs() encountered an unencodable "
3489 "wide character");
3490 if (reason == NULL)
3491 return NULL;
3492
3493 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3494 "locale", unicode,
3495 (Py_ssize_t)error_pos,
3496 (Py_ssize_t)(error_pos+1),
3497 reason);
3498 Py_DECREF(reason);
3499 if (exc != NULL) {
3500 PyCodec_StrictErrors(exc);
3501 Py_XDECREF(exc);
3502 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003503 return NULL;
3504}
3505
Victor Stinnerad158722010-10-27 00:25:46 +00003506PyObject *
3507PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003508{
Victor Stinner99b95382011-07-04 14:23:54 +02003509#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003510 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003511#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003512 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003513#else
Victor Stinner793b5312011-04-27 00:24:21 +02003514 PyInterpreterState *interp = PyThreadState_GET()->interp;
3515 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3516 cannot use it to encode and decode filenames before it is loaded. Load
3517 the Python codec requires to encode at least its own filename. Use the C
3518 version of the locale codec until the codec registry is initialized and
3519 the Python codec is loaded.
3520
3521 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3522 cannot only rely on it: check also interp->fscodec_initialized for
3523 subinterpreters. */
3524 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003525 return PyUnicode_AsEncodedString(unicode,
3526 Py_FileSystemDefaultEncoding,
3527 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003528 }
3529 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003530 return PyUnicode_EncodeLocale(unicode, "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003531 }
Victor Stinnerad158722010-10-27 00:25:46 +00003532#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003533}
3534
Alexander Belopolsky40018472011-02-26 01:02:56 +00003535PyObject *
3536PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003537 const char *encoding,
3538 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003539{
3540 PyObject *v;
Victor Stinner942889a2016-09-05 15:40:10 -07003541 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003542
Guido van Rossumd57fd912000-03-10 22:53:23 +00003543 if (!PyUnicode_Check(unicode)) {
3544 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003545 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003546 }
Fred Drakee4315f52000-05-09 19:53:39 +00003547
Victor Stinner942889a2016-09-05 15:40:10 -07003548 if (encoding == NULL) {
3549 return _PyUnicode_AsUTF8String(unicode, errors);
3550 }
3551
Fred Drakee4315f52000-05-09 19:53:39 +00003552 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003553 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3554 char *lower = buflower;
3555
3556 /* Fast paths */
3557 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3558 lower += 3;
3559 if (*lower == '_') {
3560 /* Match "utf8" and "utf_8" */
3561 lower++;
3562 }
3563
3564 if (lower[0] == '8' && lower[1] == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003565 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner942889a2016-09-05 15:40:10 -07003566 }
3567 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3568 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3569 }
3570 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3571 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3572 }
Victor Stinnera5c68c32011-03-02 01:03:14 +00003573 }
Victor Stinner942889a2016-09-05 15:40:10 -07003574 else {
3575 if (strcmp(lower, "ascii") == 0
3576 || strcmp(lower, "us_ascii") == 0) {
3577 return _PyUnicode_AsASCIIString(unicode, errors);
3578 }
Victor Stinner99b95382011-07-04 14:23:54 +02003579#ifdef HAVE_MBCS
Victor Stinner942889a2016-09-05 15:40:10 -07003580 else if (strcmp(lower, "mbcs") == 0) {
3581 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3582 }
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003583#endif
Victor Stinner942889a2016-09-05 15:40:10 -07003584 else if (strcmp(lower, "latin1") == 0 ||
3585 strcmp(lower, "latin_1") == 0 ||
3586 strcmp(lower, "iso_8859_1") == 0 ||
3587 strcmp(lower, "iso8859_1") == 0) {
3588 return _PyUnicode_AsLatin1String(unicode, errors);
3589 }
3590 }
Victor Stinner37296e82010-06-10 13:36:23 +00003591 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003592
3593 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003594 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003595 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003596 return NULL;
3597
3598 /* The normal path */
3599 if (PyBytes_Check(v))
3600 return v;
3601
3602 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003603 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003604 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003605 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003606
3607 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003608 "encoder %s returned bytearray instead of bytes; "
3609 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003610 encoding);
3611 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003612 Py_DECREF(v);
3613 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003614 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003615
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003616 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3617 Py_DECREF(v);
3618 return b;
3619 }
3620
3621 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003622 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
3623 "use codecs.encode() to encode to arbitrary types",
3624 encoding,
3625 Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003626 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003627 return NULL;
3628}
3629
Alexander Belopolsky40018472011-02-26 01:02:56 +00003630PyObject *
3631PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003632 const char *encoding,
3633 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003634{
3635 PyObject *v;
3636
3637 if (!PyUnicode_Check(unicode)) {
3638 PyErr_BadArgument();
3639 goto onError;
3640 }
3641
3642 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003643 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003644
3645 /* Encode via the codec registry */
3646 v = PyCodec_Encode(unicode, encoding, errors);
3647 if (v == NULL)
3648 goto onError;
3649 if (!PyUnicode_Check(v)) {
3650 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003651 "'%.400s' encoder returned '%.400s' instead of 'str'; "
3652 "use codecs.encode() to encode to arbitrary types",
3653 encoding,
3654 Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003655 Py_DECREF(v);
3656 goto onError;
3657 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003658 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003659
Benjamin Peterson29060642009-01-31 22:14:21 +00003660 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003661 return NULL;
3662}
3663
/* Find the first byte of str that mbrtowc() cannot convert.

   Walks at most len bytes, starting from a zeroed conversion state.
   Returns the byte offset at which mbrtowc() reports an invalid or
   incomplete multibyte sequence.  Returns 0 when no such sequence is
   found, and always returns 0 when HAVE_MBRTOWC is not defined. */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *p = str;
    size_t remaining = len;
    mbstate_t state;

    memset(&state, 0, sizeof state);
    while (remaining != 0) {
        wchar_t wc;
        size_t consumed = mbrtowc(&wc, p, remaining, &state);

        if (consumed == 0) {
            /* Reached end of string */
            break;
        }
        if (consumed == (size_t)-1 || consumed == (size_t)-2) {
            /* Conversion error or incomplete character */
            return (size_t)(p - str);
        }
        p += consumed;
        remaining -= consumed;
    }
    /* failed to find the undecodable byte sequence */
    return 0;
#else
    /* No mbrtowc(): the position cannot be determined */
    (void)str;
    (void)len;
    return 0;
#endif
}
3694
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003695PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003696PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003697 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003698{
3699 wchar_t smallbuf[256];
3700 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3701 wchar_t *wstr;
3702 size_t wlen, wlen2;
3703 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003704 int surrogateescape;
Victor Stinner2f197072011-12-17 07:08:30 +01003705 size_t error_pos;
3706 char *errmsg;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01003707 PyObject *reason = NULL; /* initialize to prevent gcc warning */
3708 PyObject *exc;
Victor Stinner1b579672011-12-17 05:47:23 +01003709
3710 if (locale_error_handler(errors, &surrogateescape) < 0)
3711 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003712
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003713 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3714 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003715 return NULL;
3716 }
3717
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003718 if (surrogateescape) {
3719 /* "surrogateescape" error handler */
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003720 wstr = Py_DecodeLocale(str, &wlen);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003721 if (wstr == NULL) {
3722 if (wlen == (size_t)-1)
3723 PyErr_NoMemory();
3724 else
3725 PyErr_SetFromErrno(PyExc_OSError);
3726 return NULL;
3727 }
3728
3729 unicode = PyUnicode_FromWideChar(wstr, wlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003730 PyMem_RawFree(wstr);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003731 }
3732 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003733 /* strict mode */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003734#ifndef HAVE_BROKEN_MBSTOWCS
3735 wlen = mbstowcs(NULL, str, 0);
3736#else
3737 wlen = len;
3738#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003739 if (wlen == (size_t)-1)
3740 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003741 if (wlen+1 <= smallbuf_len) {
3742 wstr = smallbuf;
3743 }
3744 else {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02003745 wstr = PyMem_New(wchar_t, wlen+1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003746 if (!wstr)
3747 return PyErr_NoMemory();
3748 }
3749
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003750 wlen2 = mbstowcs(wstr, str, wlen+1);
3751 if (wlen2 == (size_t)-1) {
3752 if (wstr != smallbuf)
3753 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003754 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003755 }
3756#ifdef HAVE_BROKEN_MBSTOWCS
3757 assert(wlen2 == wlen);
3758#endif
3759 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3760 if (wstr != smallbuf)
3761 PyMem_Free(wstr);
3762 }
3763 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003764
3765decode_error:
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +02003766 reason = NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01003767 errmsg = strerror(errno);
3768 assert(errmsg != NULL);
3769
3770 error_pos = mbstowcs_errorpos(str, len);
3771 if (errmsg != NULL) {
3772 size_t errlen;
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003773 wstr = Py_DecodeLocale(errmsg, &errlen);
Victor Stinner2f197072011-12-17 07:08:30 +01003774 if (wstr != NULL) {
3775 reason = PyUnicode_FromWideChar(wstr, errlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003776 PyMem_RawFree(wstr);
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +02003777 }
Victor Stinner2f197072011-12-17 07:08:30 +01003778 }
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +02003779 if (reason == NULL)
Victor Stinner2f197072011-12-17 07:08:30 +01003780 reason = PyUnicode_FromString(
3781 "mbstowcs() encountered an invalid multibyte sequence");
3782 if (reason == NULL)
3783 return NULL;
3784
3785 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3786 "locale", str, len,
3787 (Py_ssize_t)error_pos,
3788 (Py_ssize_t)(error_pos+1),
3789 reason);
3790 Py_DECREF(reason);
3791 if (exc != NULL) {
3792 PyCodec_StrictErrors(exc);
3793 Py_XDECREF(exc);
3794 }
3795 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003796}
3797
3798PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003799PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003800{
3801 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003802 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003803}
3804
3805
3806PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003807PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003808 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003809 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3810}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003811
Christian Heimes5894ba72007-11-04 11:43:14 +00003812PyObject*
3813PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3814{
Victor Stinner99b95382011-07-04 14:23:54 +02003815#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003816 return PyUnicode_DecodeMBCS(s, size, NULL);
3817#elif defined(__APPLE__)
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003818 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003819#else
Victor Stinner793b5312011-04-27 00:24:21 +02003820 PyInterpreterState *interp = PyThreadState_GET()->interp;
3821 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3822 cannot use it to encode and decode filenames before it is loaded. Load
3823 the Python codec requires to encode at least its own filename. Use the C
3824 version of the locale codec until the codec registry is initialized and
3825 the Python codec is loaded.
3826
3827 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3828 cannot only rely on it: check also interp->fscodec_initialized for
3829 subinterpreters. */
3830 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003831 return PyUnicode_Decode(s, size,
3832 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003833 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003834 }
3835 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003836 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003837 }
Victor Stinnerad158722010-10-27 00:25:46 +00003838#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003839}
3840
Martin v. Löwis011e8422009-05-05 04:43:17 +00003841
3842int
3843PyUnicode_FSConverter(PyObject* arg, void* addr)
3844{
3845 PyObject *output = NULL;
3846 Py_ssize_t size;
3847 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003848 if (arg == NULL) {
3849 Py_DECREF(*(PyObject**)addr);
Benjamin Petersona4d33b32015-11-15 21:57:39 -08003850 *(PyObject**)addr = NULL;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003851 return 1;
3852 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003853 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003854 output = arg;
3855 Py_INCREF(output);
3856 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003857 else if (PyUnicode_Check(arg)) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003858 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003859 if (!output)
3860 return 0;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003861 assert(PyBytes_Check(output));
3862 }
3863 else {
3864 PyErr_Format(PyExc_TypeError,
3865 "must be str or bytes, not %.100s",
3866 Py_TYPE(arg)->tp_name);
3867 return 0;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003868 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003869 size = PyBytes_GET_SIZE(output);
3870 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02003871 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003872 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003873 Py_DECREF(output);
3874 return 0;
3875 }
3876 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003877 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003878}
3879
3880
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003881int
3882PyUnicode_FSDecoder(PyObject* arg, void* addr)
3883{
3884 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003885 if (arg == NULL) {
3886 Py_DECREF(*(PyObject**)addr);
3887 return 1;
3888 }
3889 if (PyUnicode_Check(arg)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003890 if (PyUnicode_READY(arg) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003891 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003892 output = arg;
3893 Py_INCREF(output);
3894 }
Serhiy Storchakafebc3322016-08-06 23:29:29 +03003895 else if (PyBytes_Check(arg) || PyObject_CheckBuffer(arg)) {
3896 if (!PyBytes_Check(arg) &&
3897 PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
3898 "path should be string or bytes, not %.200s",
3899 Py_TYPE(arg)->tp_name)) {
3900 return 0;
3901 }
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003902 arg = PyBytes_FromObject(arg);
3903 if (!arg)
3904 return 0;
3905 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3906 PyBytes_GET_SIZE(arg));
3907 Py_DECREF(arg);
3908 if (!output)
3909 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003910 }
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003911 else {
3912 PyErr_Format(PyExc_TypeError,
3913 "path should be string or bytes, not %.200s",
3914 Py_TYPE(arg)->tp_name);
3915 return 0;
3916 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003917 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003918 Py_DECREF(output);
3919 return 0;
3920 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003921 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003922 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003923 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003924 Py_DECREF(output);
3925 return 0;
3926 }
3927 *(PyObject**)addr = output;
3928 return Py_CLEANUP_SUPPORTED;
3929}
3930
3931
Martin v. Löwis5b222132007-06-10 09:51:05 +00003932char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003933PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003934{
Christian Heimesf3863112007-11-22 07:46:41 +00003935 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003936
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003937 if (!PyUnicode_Check(unicode)) {
3938 PyErr_BadArgument();
3939 return NULL;
3940 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003941 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003942 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003943
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003944 if (PyUnicode_UTF8(unicode) == NULL) {
3945 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003946 bytes = _PyUnicode_AsUTF8String(unicode, NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003947 if (bytes == NULL)
3948 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003949 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3950 if (_PyUnicode_UTF8(unicode) == NULL) {
Victor Stinnera5afb582013-10-29 01:28:23 +01003951 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003952 Py_DECREF(bytes);
3953 return NULL;
3954 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003955 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3956 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3957 PyBytes_AS_STRING(bytes),
3958 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003959 Py_DECREF(bytes);
3960 }
3961
3962 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003963 *psize = PyUnicode_UTF8_LENGTH(unicode);
3964 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003965}
3966
3967char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003968PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003969{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003970 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3971}
3972
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003973Py_UNICODE *
3974PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3975{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003976 const unsigned char *one_byte;
3977#if SIZEOF_WCHAR_T == 4
3978 const Py_UCS2 *two_bytes;
3979#else
3980 const Py_UCS4 *four_bytes;
3981 const Py_UCS4 *ucs4_end;
3982 Py_ssize_t num_surrogates;
3983#endif
3984 wchar_t *w;
3985 wchar_t *wchar_end;
3986
3987 if (!PyUnicode_Check(unicode)) {
3988 PyErr_BadArgument();
3989 return NULL;
3990 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003991 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003992 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003993 assert(_PyUnicode_KIND(unicode) != 0);
3994 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003995
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003996 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003997#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003998 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3999 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004000 num_surrogates = 0;
4001
4002 for (; four_bytes < ucs4_end; ++four_bytes) {
4003 if (*four_bytes > 0xFFFF)
4004 ++num_surrogates;
4005 }
4006
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004007 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
4008 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
4009 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004010 PyErr_NoMemory();
4011 return NULL;
4012 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004013 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004014
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004015 w = _PyUnicode_WSTR(unicode);
4016 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
4017 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004018 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
4019 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004020 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004021 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01004022 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
4023 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004024 }
4025 else
4026 *w = *four_bytes;
4027
4028 if (w > wchar_end) {
4029 assert(0 && "Miscalculated string end");
4030 }
4031 }
4032 *w = 0;
4033#else
4034 /* sizeof(wchar_t) == 4 */
4035 Py_FatalError("Impossible unicode object state, wstr and str "
4036 "should share memory already.");
4037 return NULL;
4038#endif
4039 }
4040 else {
Serhiy Storchakae55181f2015-02-20 21:34:06 +02004041 if ((size_t)_PyUnicode_LENGTH(unicode) >
4042 PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
4043 PyErr_NoMemory();
4044 return NULL;
4045 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004046 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
4047 (_PyUnicode_LENGTH(unicode) + 1));
4048 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004049 PyErr_NoMemory();
4050 return NULL;
4051 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004052 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
4053 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
4054 w = _PyUnicode_WSTR(unicode);
4055 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004056
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004057 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
4058 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004059 for (; w < wchar_end; ++one_byte, ++w)
4060 *w = *one_byte;
4061 /* null-terminate the wstr */
4062 *w = 0;
4063 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004064 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004065#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004066 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004067 for (; w < wchar_end; ++two_bytes, ++w)
4068 *w = *two_bytes;
4069 /* null-terminate the wstr */
4070 *w = 0;
4071#else
4072 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004073 PyObject_FREE(_PyUnicode_WSTR(unicode));
4074 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004075 Py_FatalError("Impossible unicode object state, wstr "
4076 "and str should share memory already.");
4077 return NULL;
4078#endif
4079 }
4080 else {
4081 assert(0 && "This should never happen.");
4082 }
4083 }
4084 }
4085 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004086 *size = PyUnicode_WSTR_LENGTH(unicode);
4087 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00004088}
4089
Alexander Belopolsky40018472011-02-26 01:02:56 +00004090Py_UNICODE *
4091PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004092{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004093 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004094}
4095
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004096
Alexander Belopolsky40018472011-02-26 01:02:56 +00004097Py_ssize_t
4098PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004099{
4100 if (!PyUnicode_Check(unicode)) {
4101 PyErr_BadArgument();
4102 goto onError;
4103 }
4104 return PyUnicode_GET_SIZE(unicode);
4105
Benjamin Peterson29060642009-01-31 22:14:21 +00004106 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004107 return -1;
4108}
4109
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004110Py_ssize_t
4111PyUnicode_GetLength(PyObject *unicode)
4112{
Victor Stinner07621332012-06-16 04:53:46 +02004113 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004114 PyErr_BadArgument();
4115 return -1;
4116 }
Victor Stinner07621332012-06-16 04:53:46 +02004117 if (PyUnicode_READY(unicode) == -1)
4118 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004119 return PyUnicode_GET_LENGTH(unicode);
4120}
4121
4122Py_UCS4
4123PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4124{
Victor Stinner69ed0f42013-04-09 21:48:24 +02004125 void *data;
4126 int kind;
4127
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004128 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
4129 PyErr_BadArgument();
4130 return (Py_UCS4)-1;
4131 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004132 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004133 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004134 return (Py_UCS4)-1;
4135 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02004136 data = PyUnicode_DATA(unicode);
4137 kind = PyUnicode_KIND(unicode);
4138 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004139}
4140
4141int
4142PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4143{
4144 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004145 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004146 return -1;
4147 }
Victor Stinner488fa492011-12-12 00:01:39 +01004148 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004149 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004150 PyErr_SetString(PyExc_IndexError, "string index out of range");
4151 return -1;
4152 }
Victor Stinner488fa492011-12-12 00:01:39 +01004153 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004154 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004155 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4156 PyErr_SetString(PyExc_ValueError, "character out of range");
4157 return -1;
4158 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004159 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4160 index, ch);
4161 return 0;
4162}
4163
/* Return the name of the default encoding, which is always "utf-8".
   The returned string has static storage and must not be freed. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";
    return default_encoding;
}
4169
Victor Stinner554f3f02010-06-16 23:33:54 +00004170/* create or adjust a UnicodeDecodeError */
4171static void
4172make_decode_exception(PyObject **exceptionObject,
4173 const char *encoding,
4174 const char *input, Py_ssize_t length,
4175 Py_ssize_t startpos, Py_ssize_t endpos,
4176 const char *reason)
4177{
4178 if (*exceptionObject == NULL) {
4179 *exceptionObject = PyUnicodeDecodeError_Create(
4180 encoding, input, length, startpos, endpos, reason);
4181 }
4182 else {
4183 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4184 goto onError;
4185 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4186 goto onError;
4187 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4188 goto onError;
4189 }
4190 return;
4191
4192onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004193 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004194}
4195
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004196#ifdef HAVE_MBCS
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004197/* error handling callback helper:
4198 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00004199 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004200 and adjust various state variables.
4201 return 0 on success, -1 on error
4202*/
4203
Alexander Belopolsky40018472011-02-26 01:02:56 +00004204static int
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004205unicode_decode_call_errorhandler_wchar(
4206 const char *errors, PyObject **errorHandler,
4207 const char *encoding, const char *reason,
4208 const char **input, const char **inend, Py_ssize_t *startinpos,
4209 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4210 PyObject **output, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004211{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02004212 static const char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004213
4214 PyObject *restuple = NULL;
4215 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01004216 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004217 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004218 Py_ssize_t requiredsize;
4219 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004220 PyObject *inputobj = NULL;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004221 wchar_t *repwstr;
4222 Py_ssize_t repwlen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004223
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004224 assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
4225 outsize = _PyUnicode_WSTR_LENGTH(*output);
Victor Stinner596a6c42011-11-09 00:02:18 +01004226
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004227 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004228 *errorHandler = PyCodec_LookupError(errors);
4229 if (*errorHandler == NULL)
4230 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004231 }
4232
Victor Stinner554f3f02010-06-16 23:33:54 +00004233 make_decode_exception(exceptionObject,
4234 encoding,
4235 *input, *inend - *input,
4236 *startinpos, *endinpos,
4237 reason);
4238 if (*exceptionObject == NULL)
4239 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004240
4241 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4242 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004243 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004244 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00004245 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004246 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004247 }
4248 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004249 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004250
4251 /* Copy back the bytes variables, which might have been modified by the
4252 callback */
4253 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4254 if (!inputobj)
4255 goto onError;
4256 if (!PyBytes_Check(inputobj)) {
4257 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
4258 }
4259 *input = PyBytes_AS_STRING(inputobj);
4260 insize = PyBytes_GET_SIZE(inputobj);
4261 *inend = *input + insize;
4262 /* we can DECREF safely, as the exception has another reference,
4263 so the object won't go away. */
4264 Py_DECREF(inputobj);
4265
4266 if (newpos<0)
4267 newpos = insize+newpos;
4268 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004269 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004270 goto onError;
4271 }
4272
4273 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4274 if (repwstr == NULL)
4275 goto onError;
4276 /* need more space? (at least enough for what we
4277 have+the replacement+the rest of the string (starting
4278 at the new input position), so we won't have to check space
4279 when there are no errors in the rest of the string) */
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004280 requiredsize = *outpos;
4281 if (requiredsize > PY_SSIZE_T_MAX - repwlen)
4282 goto overflow;
4283 requiredsize += repwlen;
4284 if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4285 goto overflow;
4286 requiredsize += insize - newpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004287 if (requiredsize > outsize) {
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004288 if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004289 requiredsize = 2*outsize;
4290 if (unicode_resize(output, requiredsize) < 0)
4291 goto onError;
4292 }
4293 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
4294 *outpos += repwlen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004295 *endinpos = newpos;
4296 *inptr = *input + newpos;
4297
4298 /* we made it! */
4299 Py_XDECREF(restuple);
4300 return 0;
4301
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004302 overflow:
4303 PyErr_SetString(PyExc_OverflowError,
4304 "decoded result is too long for a Python string");
4305
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004306 onError:
4307 Py_XDECREF(restuple);
4308 return -1;
4309}
4310#endif /* HAVE_MBCS */
4311
4312static int
4313unicode_decode_call_errorhandler_writer(
4314 const char *errors, PyObject **errorHandler,
4315 const char *encoding, const char *reason,
4316 const char **input, const char **inend, Py_ssize_t *startinpos,
4317 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4318 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4319{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02004320 static const char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004321
4322 PyObject *restuple = NULL;
4323 PyObject *repunicode = NULL;
4324 Py_ssize_t insize;
4325 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004326 Py_ssize_t replen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004327 PyObject *inputobj = NULL;
4328
4329 if (*errorHandler == NULL) {
4330 *errorHandler = PyCodec_LookupError(errors);
4331 if (*errorHandler == NULL)
4332 goto onError;
4333 }
4334
4335 make_decode_exception(exceptionObject,
4336 encoding,
4337 *input, *inend - *input,
4338 *startinpos, *endinpos,
4339 reason);
4340 if (*exceptionObject == NULL)
4341 goto onError;
4342
4343 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4344 if (restuple == NULL)
4345 goto onError;
4346 if (!PyTuple_Check(restuple)) {
4347 PyErr_SetString(PyExc_TypeError, &argparse[4]);
4348 goto onError;
4349 }
4350 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004351 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004352
4353 /* Copy back the bytes variables, which might have been modified by the
4354 callback */
4355 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4356 if (!inputobj)
4357 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00004358 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004359 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00004360 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004361 *input = PyBytes_AS_STRING(inputobj);
4362 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004363 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004364 /* we can DECREF safely, as the exception has another reference,
4365 so the object won't go away. */
4366 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004367
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004368 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004369 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004370 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004371 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004372 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004373 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004374
Victor Stinner8f674cc2013-04-17 23:02:17 +02004375 if (PyUnicode_READY(repunicode) < 0)
4376 goto onError;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004377 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004378 if (replen > 1) {
4379 writer->min_length += replen - 1;
Victor Stinner8f674cc2013-04-17 23:02:17 +02004380 writer->overallocate = 1;
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004381 if (_PyUnicodeWriter_Prepare(writer, writer->min_length,
4382 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4383 goto onError;
4384 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004385 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004386 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004387
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004388 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004389 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004390
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004391 /* we made it! */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004392 Py_XDECREF(restuple);
4393 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004394
Benjamin Peterson29060642009-01-31 22:14:21 +00004395 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004396 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004397 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004398}
4399
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004400/* --- UTF-7 Codec -------------------------------------------------------- */
4401
Antoine Pitrou244651a2009-05-04 18:56:13 +00004402/* See RFC2152 for details. We encode conservatively and decode liberally. */
4403
4404/* Three simple macros defining base-64. */
4405
/* Non-zero when c is one of the 64 base-64 alphabet characters
   (A-Z, a-z, 0-9, '+', '/').  Note: c is evaluated more than once. */

#define IS_BASE64(c)                    \
    ((c) == '+' || (c) == '/' ||        \
     ((c) >= '0' && (c) <= '9') ||      \
     ((c) >= 'a' && (c) <= 'z') ||      \
     ((c) >= 'A' && (c) <= 'Z'))
4413
/* Map a base-64 character c to its 6-bit value (0..63).  Only meaningful
   when IS_BASE64(c) holds; any other input yields 63.  Note: c is
   evaluated more than once. */

#define FROM_BASE64(c)                                  \
    ((c) == '+' ? 62 :                                  \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : 63)
4421
/* Base-64 character for the low 6 bits of n (higher bits are ignored). */

#define TO_BASE64(n)                    \
    (("ABCDEFGHIJKLMNOPQRSTUVWXYZ"      \
      "abcdefghijklmnopqrstuvwxyz"      \
      "0123456789+/")[(n) & 63])
4426
/* DECODE_DIRECT: non-zero when byte c, met outside a shift sequence in
 * a UTF-7 string, decodes as itself.  Decoding is permissive: every
 * value up to 127 qualifies except '+', which opens a base-64
 * section.  Note: c is evaluated more than once. */

#define DECODE_DIRECT(c) \
    ((c) != '+' && (c) <= 127)
4434
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array, indexed by the ASCII code
 * (0..127), identifies these different sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is only ever read, so it is declared const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
4468
/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself (directO),
 * and also on whether we are encoding whitespace as itself (directWS).
 * RFC2152 makes it clear that the answers to these questions vary
 * between applications, so this code needs to be flexible.
 *
 * c is looked up in utf7_category only when 0 < c < 128.  All three
 * arguments are parenthesized in the expansion so that compound
 * expressions (e.g. `a || b`) are not re-associated with the
 * surrounding `&&`.  Note: c is evaluated more than once. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004480
Alexander Belopolsky40018472011-02-26 01:02:56 +00004481PyObject *
4482PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004483 Py_ssize_t size,
4484 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004485{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004486 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4487}
4488
Antoine Pitrou244651a2009-05-04 18:56:13 +00004489/* The decoder. The only state we preserve is our read position,
4490 * i.e. how many characters we have consumed. So if we end in the
4491 * middle of a shift sequence we have to back off the read position
4492 * and the output to the beginning of the sequence, otherwise we lose
4493 * all the shift state (seen bits, number of bits seen, high
4494 * surrogate). */
4495
Alexander Belopolsky40018472011-02-26 01:02:56 +00004496PyObject *
4497PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004498 Py_ssize_t size,
4499 const char *errors,
4500 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004501{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004502 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004503 Py_ssize_t startinpos;
4504 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004505 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004506 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004507 const char *errmsg = "";
4508 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004509 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004510 unsigned int base64bits = 0;
4511 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004512 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004513 PyObject *errorHandler = NULL;
4514 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004515
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004516 if (size == 0) {
4517 if (consumed)
4518 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004519 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004520 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004521
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004522 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004523 _PyUnicodeWriter_Init(&writer);
4524 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004525
4526 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004527 e = s + size;
4528
4529 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004530 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004531 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004532 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004533
Antoine Pitrou244651a2009-05-04 18:56:13 +00004534 if (inShift) { /* in a base-64 section */
4535 if (IS_BASE64(ch)) { /* consume a base-64 character */
4536 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4537 base64bits += 6;
4538 s++;
4539 if (base64bits >= 16) {
4540 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004541 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004542 base64bits -= 16;
4543 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004544 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004545 if (surrogate) {
4546 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004547 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4548 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004549 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004550 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004551 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004552 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004553 }
4554 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004555 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004556 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004557 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004558 }
4559 }
Victor Stinner551ac952011-11-29 22:58:13 +01004560 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004561 /* first surrogate */
4562 surrogate = outCh;
4563 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004564 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004565 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004566 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004567 }
4568 }
4569 }
4570 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004571 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004572 if (base64bits > 0) { /* left-over bits */
4573 if (base64bits >= 6) {
4574 /* We've seen at least one base-64 character */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004575 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004576 errmsg = "partial character in shift sequence";
4577 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004578 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004579 else {
4580 /* Some bits remain; they should be zero */
4581 if (base64buffer != 0) {
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004582 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004583 errmsg = "non-zero padding bits in shift sequence";
4584 goto utf7Error;
4585 }
4586 }
4587 }
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004588 if (surrogate && DECODE_DIRECT(ch)) {
4589 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4590 goto onError;
4591 }
4592 surrogate = 0;
4593 if (ch == '-') {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004594 /* '-' is absorbed; other terminating
4595 characters are preserved */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004596 s++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004597 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004598 }
4599 }
4600 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004601 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004602 s++; /* consume '+' */
4603 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004604 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004605 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004606 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004607 }
4608 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004609 inShift = 1;
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004610 surrogate = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004611 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004612 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004613 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004614 }
4615 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004616 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004617 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004618 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004619 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004620 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004621 else {
4622 startinpos = s-starts;
4623 s++;
4624 errmsg = "unexpected special character";
4625 goto utf7Error;
4626 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004627 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004628utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004629 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004630 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004631 errors, &errorHandler,
4632 "utf7", errmsg,
4633 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004634 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004635 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004636 }
4637
Antoine Pitrou244651a2009-05-04 18:56:13 +00004638 /* end of string */
4639
4640 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4641 /* if we're in an inconsistent state, that's an error */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004642 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004643 if (surrogate ||
4644 (base64bits >= 6) ||
4645 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004646 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004647 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004648 errors, &errorHandler,
4649 "utf7", "unterminated shift sequence",
4650 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004651 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004652 goto onError;
4653 if (s < e)
4654 goto restart;
4655 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004656 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004657
4658 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004659 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004660 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004661 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004662 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004663 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004664 writer.kind, writer.data, shiftOutStart);
4665 Py_XDECREF(errorHandler);
4666 Py_XDECREF(exc);
4667 _PyUnicodeWriter_Dealloc(&writer);
4668 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004669 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004670 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004671 }
4672 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004673 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004674 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004675 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004676
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004677 Py_XDECREF(errorHandler);
4678 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004679 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004680
Benjamin Peterson29060642009-01-31 22:14:21 +00004681 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004682 Py_XDECREF(errorHandler);
4683 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004684 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004685 return NULL;
4686}
4687
4688
Alexander Belopolsky40018472011-02-26 01:02:56 +00004689PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004690_PyUnicode_EncodeUTF7(PyObject *str,
4691 int base64SetO,
4692 int base64WhiteSpace,
4693 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004694{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004695 int kind;
4696 void *data;
4697 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004698 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004699 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004700 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004701 unsigned int base64bits = 0;
4702 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004703 char * out;
4704 char * start;
4705
Benjamin Petersonbac79492012-01-14 13:34:47 -05004706 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004707 return NULL;
4708 kind = PyUnicode_KIND(str);
4709 data = PyUnicode_DATA(str);
4710 len = PyUnicode_GET_LENGTH(str);
4711
4712 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004713 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004714
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004715 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004716 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004717 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004718 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004719 if (v == NULL)
4720 return NULL;
4721
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004722 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004723 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004724 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004725
Antoine Pitrou244651a2009-05-04 18:56:13 +00004726 if (inShift) {
4727 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4728 /* shifting out */
4729 if (base64bits) { /* output remaining bits */
4730 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4731 base64buffer = 0;
4732 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004733 }
4734 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004735 /* Characters not in the BASE64 set implicitly unshift the sequence
4736 so no '-' is required, except if the character is itself a '-' */
4737 if (IS_BASE64(ch) || ch == '-') {
4738 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004739 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004740 *out++ = (char) ch;
4741 }
4742 else {
4743 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004744 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004745 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004746 else { /* not in a shift sequence */
4747 if (ch == '+') {
4748 *out++ = '+';
4749 *out++ = '-';
4750 }
4751 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4752 *out++ = (char) ch;
4753 }
4754 else {
4755 *out++ = '+';
4756 inShift = 1;
4757 goto encode_char;
4758 }
4759 }
4760 continue;
4761encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004762 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004763 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004764
Antoine Pitrou244651a2009-05-04 18:56:13 +00004765 /* code first surrogate */
4766 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004767 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004768 while (base64bits >= 6) {
4769 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4770 base64bits -= 6;
4771 }
4772 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004773 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004774 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004775 base64bits += 16;
4776 base64buffer = (base64buffer << 16) | ch;
4777 while (base64bits >= 6) {
4778 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4779 base64bits -= 6;
4780 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004781 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004782 if (base64bits)
4783 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4784 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004785 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004786 if (_PyBytes_Resize(&v, out - start) < 0)
4787 return NULL;
4788 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004789}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004790PyObject *
4791PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4792 Py_ssize_t size,
4793 int base64SetO,
4794 int base64WhiteSpace,
4795 const char *errors)
4796{
4797 PyObject *result;
4798 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4799 if (tmp == NULL)
4800 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004801 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004802 base64WhiteSpace, errors);
4803 Py_DECREF(tmp);
4804 return result;
4805}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004806
Antoine Pitrou244651a2009-05-04 18:56:13 +00004807#undef IS_BASE64
4808#undef FROM_BASE64
4809#undef TO_BASE64
4810#undef DECODE_DIRECT
4811#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004812
Guido van Rossumd57fd912000-03-10 22:53:23 +00004813/* --- UTF-8 Codec -------------------------------------------------------- */
4814
Alexander Belopolsky40018472011-02-26 01:02:56 +00004815PyObject *
4816PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004817 Py_ssize_t size,
4818 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004819{
Walter Dörwald69652032004-09-07 20:24:22 +00004820 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4821}
4822
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004823#include "stringlib/asciilib.h"
4824#include "stringlib/codecs.h"
4825#include "stringlib/undef.h"
4826
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004827#include "stringlib/ucs1lib.h"
4828#include "stringlib/codecs.h"
4829#include "stringlib/undef.h"
4830
4831#include "stringlib/ucs2lib.h"
4832#include "stringlib/codecs.h"
4833#include "stringlib/undef.h"
4834
4835#include "stringlib/ucs4lib.h"
4836#include "stringlib/codecs.h"
4837#include "stringlib/undef.h"
4838
Antoine Pitrouab868312009-01-10 15:40:25 +00004839/* Mask to quickly check whether a C 'long' contains a
4840 non-ASCII, UTF8-encoded char. */
4841#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004842# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004843#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004844# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004845#else
4846# error C 'long' size should be either 4 or 8!
4847#endif
4848
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004849static Py_ssize_t
4850ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004851{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004852 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004853 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004854
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004855 /*
4856 * Issue #17237: m68k is a bit different from most architectures in
4857 * that objects do not use "natural alignment" - for example, int and
4858 * long are only aligned at 2-byte boundaries. Therefore the assert()
4859 * won't work; also, tests have shown that skipping the "optimised
4860 * version" will even speed up m68k.
4861 */
4862#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004863#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004864 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4865 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004866 /* Fast path, see in STRINGLIB(utf8_decode) for
4867 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004868 /* Help allocation */
4869 const char *_p = p;
4870 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004871 while (_p < aligned_end) {
4872 unsigned long value = *(const unsigned long *) _p;
4873 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004874 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004875 *((unsigned long *)q) = value;
4876 _p += SIZEOF_LONG;
4877 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004878 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004879 p = _p;
4880 while (p < end) {
4881 if ((unsigned char)*p & 0x80)
4882 break;
4883 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004884 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004885 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004886 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004887#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004888#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004889 while (p < end) {
4890 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4891 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004892 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004893 /* Help allocation */
4894 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004895 while (_p < aligned_end) {
4896 unsigned long value = *(unsigned long *) _p;
4897 if (value & ASCII_CHAR_MASK)
4898 break;
4899 _p += SIZEOF_LONG;
4900 }
4901 p = _p;
4902 if (_p == end)
4903 break;
4904 }
4905 if ((unsigned char)*p & 0x80)
4906 break;
4907 ++p;
4908 }
4909 memcpy(dest, start, p - start);
4910 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004911}
Antoine Pitrouab868312009-01-10 15:40:25 +00004912
Victor Stinner785938e2011-12-11 20:09:03 +01004913PyObject *
4914PyUnicode_DecodeUTF8Stateful(const char *s,
4915 Py_ssize_t size,
4916 const char *errors,
4917 Py_ssize_t *consumed)
4918{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004919 _PyUnicodeWriter writer;
Victor Stinner785938e2011-12-11 20:09:03 +01004920 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004921 const char *end = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004922
4923 Py_ssize_t startinpos;
4924 Py_ssize_t endinpos;
4925 const char *errmsg = "";
Victor Stinner1d65d912015-10-05 13:43:50 +02004926 PyObject *error_handler_obj = NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004927 PyObject *exc = NULL;
Victor Stinner1d65d912015-10-05 13:43:50 +02004928 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner785938e2011-12-11 20:09:03 +01004929
4930 if (size == 0) {
4931 if (consumed)
4932 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004933 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004934 }
4935
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004936 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4937 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004938 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004939 *consumed = 1;
4940 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004941 }
4942
Victor Stinner8f674cc2013-04-17 23:02:17 +02004943 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02004944 writer.min_length = size;
4945 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004946 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004947
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004948 writer.pos = ascii_decode(s, end, writer.data);
4949 s += writer.pos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004950 while (s < end) {
4951 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004952 int kind = writer.kind;
Victor Stinner1d65d912015-10-05 13:43:50 +02004953
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004954 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004955 if (PyUnicode_IS_ASCII(writer.buffer))
4956 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004957 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004958 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004959 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004960 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004961 } else {
4962 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004963 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004964 }
4965
4966 switch (ch) {
4967 case 0:
4968 if (s == end || consumed)
4969 goto End;
4970 errmsg = "unexpected end of data";
4971 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004972 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004973 break;
4974 case 1:
4975 errmsg = "invalid start byte";
4976 startinpos = s - starts;
4977 endinpos = startinpos + 1;
4978 break;
4979 case 2:
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004980 case 3:
4981 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004982 errmsg = "invalid continuation byte";
4983 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004984 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004985 break;
4986 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004987 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004988 goto onError;
4989 continue;
4990 }
4991
Victor Stinner1d65d912015-10-05 13:43:50 +02004992 if (error_handler == _Py_ERROR_UNKNOWN)
4993 error_handler = get_error_handler(errors);
4994
4995 switch (error_handler) {
4996 case _Py_ERROR_IGNORE:
4997 s += (endinpos - startinpos);
4998 break;
4999
5000 case _Py_ERROR_REPLACE:
5001 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
5002 goto onError;
5003 s += (endinpos - startinpos);
5004 break;
5005
5006 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner74e8fac2015-10-05 13:49:26 +02005007 {
5008 Py_ssize_t i;
5009
Victor Stinner1d65d912015-10-05 13:43:50 +02005010 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
5011 goto onError;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005012 for (i=startinpos; i<endinpos; i++) {
Victor Stinner1d65d912015-10-05 13:43:50 +02005013 ch = (Py_UCS4)(unsigned char)(starts[i]);
5014 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
5015 ch + 0xdc00);
5016 writer.pos++;
5017 }
5018 s += (endinpos - startinpos);
5019 break;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005020 }
Victor Stinner1d65d912015-10-05 13:43:50 +02005021
5022 default:
5023 if (unicode_decode_call_errorhandler_writer(
5024 errors, &error_handler_obj,
5025 "utf-8", errmsg,
5026 &starts, &end, &startinpos, &endinpos, &exc, &s,
5027 &writer))
5028 goto onError;
5029 }
Victor Stinner785938e2011-12-11 20:09:03 +01005030 }
5031
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005032End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005033 if (consumed)
5034 *consumed = s - starts;
5035
Victor Stinner1d65d912015-10-05 13:43:50 +02005036 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005037 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005038 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005039
5040onError:
Victor Stinner1d65d912015-10-05 13:43:50 +02005041 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005042 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005043 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005044 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01005045}
5046
#ifdef __APPLE__

/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X.

   s, size: the UTF-8 bytes to decode and their count.

   Return a pointer to a newly allocated, NUL-terminated wide character
   string (use PyMem_RawFree() to free the memory).  Return NULL if `size`
   is negative, if the buffer size would overflow, or on memory allocation
   error. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    const char *e;
    wchar_t *unicode;
    Py_ssize_t outpos;

    /* Note: the number of decoded code units never exceeds `size`, so
       size + 1 wide characters (including the terminator) always suffice.
       The bound is written without computing `size + 1`, which could
       itself overflow, and negative sizes are rejected so that the
       terminator always has room. */
    if (size < 0
        || size > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1)
        return NULL;
    unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
    if (!unicode)
        return NULL;

    /* Unpack UTF-8 encoded data */
    e = s + size;
    outpos = 0;
    while (s < e) {
        Py_UCS4 ch;
#if SIZEOF_WCHAR_T == 4
        ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
#else
        ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
#endif
        if (ch > 0xFF) {
#if SIZEOF_WCHAR_T == 4
            /* with 32-bit storage the decoder is expected to store every
               code point itself */
            assert(0);
#else
            /* a code point that does not fit in one 16-bit unit: it must
               lie above the BMP (it is not itself a surrogate) */
            assert(ch > 0xFFFF && ch <= 0x10FFFF);
            /* compute and append the two surrogates: */
            unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
            unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
#endif
        }
        else {
            /* status 0 with the input exhausted means we are done */
            if (!ch && s == e)
                break;
            /* surrogateescape: map the undecodable byte b to 0xDC00 + b
               and resume right after it */
            unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
        }
    }
    unicode[outpos] = L'\0';
    return unicode;
}

#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00005102
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005103/* Primary internal function which creates utf8 encoded bytes objects.
5104
5105 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005106 and allocate exactly as much space needed at the end. Else allocate the
5107 maximum possible needed (4 result bytes per Unicode character), and return
5108 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005109*/
Tim Peters7e3d9612002-04-21 03:26:37 +00005110PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01005111_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005112{
Victor Stinner6099a032011-12-18 14:22:26 +01005113 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005114 void *data;
5115 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00005116
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005117 if (!PyUnicode_Check(unicode)) {
5118 PyErr_BadArgument();
5119 return NULL;
5120 }
5121
5122 if (PyUnicode_READY(unicode) == -1)
5123 return NULL;
5124
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005125 if (PyUnicode_UTF8(unicode))
5126 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5127 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005128
5129 kind = PyUnicode_KIND(unicode);
5130 data = PyUnicode_DATA(unicode);
5131 size = PyUnicode_GET_LENGTH(unicode);
5132
Benjamin Petersonead6b532011-12-20 17:23:42 -06005133 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005134 default:
5135 assert(0);
5136 case PyUnicode_1BYTE_KIND:
5137 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5138 assert(!PyUnicode_IS_ASCII(unicode));
5139 return ucs1lib_utf8_encoder(unicode, data, size, errors);
5140 case PyUnicode_2BYTE_KIND:
5141 return ucs2lib_utf8_encoder(unicode, data, size, errors);
5142 case PyUnicode_4BYTE_KIND:
5143 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00005144 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005145}
5146
Alexander Belopolsky40018472011-02-26 01:02:56 +00005147PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005148PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5149 Py_ssize_t size,
5150 const char *errors)
5151{
5152 PyObject *v, *unicode;
5153
5154 unicode = PyUnicode_FromUnicode(s, size);
5155 if (unicode == NULL)
5156 return NULL;
5157 v = _PyUnicode_AsUTF8String(unicode, errors);
5158 Py_DECREF(unicode);
5159 return v;
5160}
5161
5162PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005163PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005164{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005165 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005166}
5167
Walter Dörwald41980ca2007-08-16 21:55:45 +00005168/* --- UTF-32 Codec ------------------------------------------------------- */
5169
5170PyObject *
5171PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005172 Py_ssize_t size,
5173 const char *errors,
5174 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005175{
5176 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5177}
5178
5179PyObject *
5180PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005181 Py_ssize_t size,
5182 const char *errors,
5183 int *byteorder,
5184 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005185{
5186 const char *starts = s;
5187 Py_ssize_t startinpos;
5188 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005189 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005190 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01005191 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005192 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005193 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005194 PyObject *errorHandler = NULL;
5195 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005196
Walter Dörwald41980ca2007-08-16 21:55:45 +00005197 q = (unsigned char *)s;
5198 e = q + size;
5199
5200 if (byteorder)
5201 bo = *byteorder;
5202
5203 /* Check for BOM marks (U+FEFF) in the input and adjust current
5204 byte order setting accordingly. In native mode, the leading BOM
5205 mark is skipped, in all other modes, it is copied to the output
5206 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01005207 if (bo == 0 && size >= 4) {
5208 Py_UCS4 bom = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5209 if (bom == 0x0000FEFF) {
5210 bo = -1;
5211 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005212 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005213 else if (bom == 0xFFFE0000) {
5214 bo = 1;
5215 q += 4;
5216 }
5217 if (byteorder)
5218 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005219 }
5220
Victor Stinnere64322e2012-10-30 23:12:47 +01005221 if (q == e) {
5222 if (consumed)
5223 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005224 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00005225 }
5226
Victor Stinnere64322e2012-10-30 23:12:47 +01005227#ifdef WORDS_BIGENDIAN
5228 le = bo < 0;
5229#else
5230 le = bo <= 0;
5231#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005232 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01005233
Victor Stinner8f674cc2013-04-17 23:02:17 +02005234 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005235 writer.min_length = (e - q + 3) / 4;
5236 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005237 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005238
Victor Stinnere64322e2012-10-30 23:12:47 +01005239 while (1) {
5240 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005241 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005242
Victor Stinnere64322e2012-10-30 23:12:47 +01005243 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005244 enum PyUnicode_Kind kind = writer.kind;
5245 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005246 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005247 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005248 if (le) {
5249 do {
5250 ch = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5251 if (ch > maxch)
5252 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005253 if (kind != PyUnicode_1BYTE_KIND &&
5254 Py_UNICODE_IS_SURROGATE(ch))
5255 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005256 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005257 q += 4;
5258 } while (q <= last);
5259 }
5260 else {
5261 do {
5262 ch = (q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
5263 if (ch > maxch)
5264 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005265 if (kind != PyUnicode_1BYTE_KIND &&
5266 Py_UNICODE_IS_SURROGATE(ch))
5267 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005268 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005269 q += 4;
5270 } while (q <= last);
5271 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005272 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005273 }
5274
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005275 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005276 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005277 startinpos = ((const char *)q) - starts;
5278 endinpos = startinpos + 4;
5279 }
5280 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005281 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005282 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005283 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005284 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005285 startinpos = ((const char *)q) - starts;
5286 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005287 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005288 else {
5289 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005290 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005291 goto onError;
5292 q += 4;
5293 continue;
5294 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005295 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005296 startinpos = ((const char *)q) - starts;
5297 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005298 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005299
5300 /* The remaining input chars are ignored if the callback
5301 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005302 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005303 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005304 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005305 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005306 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005307 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005308 }
5309
Walter Dörwald41980ca2007-08-16 21:55:45 +00005310 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005311 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005312
Walter Dörwald41980ca2007-08-16 21:55:45 +00005313 Py_XDECREF(errorHandler);
5314 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005315 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005316
Benjamin Peterson29060642009-01-31 22:14:21 +00005317 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005318 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005319 Py_XDECREF(errorHandler);
5320 Py_XDECREF(exc);
5321 return NULL;
5322}
5323
5324PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005325_PyUnicode_EncodeUTF32(PyObject *str,
5326 const char *errors,
5327 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005328{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005329 enum PyUnicode_Kind kind;
5330 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005331 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005332 PyObject *v;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005333 uint32_t *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005334#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005335 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005336#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005337 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005338#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005339 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005340 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005341 PyObject *errorHandler = NULL;
5342 PyObject *exc = NULL;
5343 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005344
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005345 if (!PyUnicode_Check(str)) {
5346 PyErr_BadArgument();
5347 return NULL;
5348 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005349 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005350 return NULL;
5351 kind = PyUnicode_KIND(str);
5352 data = PyUnicode_DATA(str);
5353 len = PyUnicode_GET_LENGTH(str);
5354
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005355 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005356 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005357 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005358 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005359 if (v == NULL)
5360 return NULL;
5361
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005362 /* output buffer is 4-bytes aligned */
5363 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005364 out = (uint32_t *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005365 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005366 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005367 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005368 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005369
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005370 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005371 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005372 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005373 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005374 else
5375 encoding = "utf-32";
5376
5377 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005378 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5379 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005380 }
5381
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005382 pos = 0;
5383 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005384 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005385
5386 if (kind == PyUnicode_2BYTE_KIND) {
5387 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5388 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005389 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005390 else {
5391 assert(kind == PyUnicode_4BYTE_KIND);
5392 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5393 &out, native_ordering);
5394 }
5395 if (pos == len)
5396 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005397
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005398 rep = unicode_encode_call_errorhandler(
5399 errors, &errorHandler,
5400 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005401 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005402 if (!rep)
5403 goto error;
5404
5405 if (PyBytes_Check(rep)) {
5406 repsize = PyBytes_GET_SIZE(rep);
5407 if (repsize & 3) {
5408 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005409 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005410 "surrogates not allowed");
5411 goto error;
5412 }
5413 moreunits = repsize / 4;
5414 }
5415 else {
5416 assert(PyUnicode_Check(rep));
5417 if (PyUnicode_READY(rep) < 0)
5418 goto error;
5419 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5420 if (!PyUnicode_IS_ASCII(rep)) {
5421 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005422 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005423 "surrogates not allowed");
5424 goto error;
5425 }
5426 }
5427
5428 /* four bytes are reserved for each surrogate */
5429 if (moreunits > 1) {
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005430 Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005431 Py_ssize_t morebytes = 4 * (moreunits - 1);
5432 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5433 /* integer overflow */
5434 PyErr_NoMemory();
5435 goto error;
5436 }
5437 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5438 goto error;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005439 out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005440 }
5441
5442 if (PyBytes_Check(rep)) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005443 Py_MEMCPY(out, PyBytes_AS_STRING(rep), repsize);
5444 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005445 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005446 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005447 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5448 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005449 }
5450
5451 Py_CLEAR(rep);
5452 }
5453
5454 /* Cut back to size actually needed. This is necessary for, for example,
5455 encoding of a string containing isolated surrogates and the 'ignore'
5456 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005457 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005458 if (nsize != PyBytes_GET_SIZE(v))
5459 _PyBytes_Resize(&v, nsize);
5460 Py_XDECREF(errorHandler);
5461 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005462 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005463 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005464 error:
5465 Py_XDECREF(rep);
5466 Py_XDECREF(errorHandler);
5467 Py_XDECREF(exc);
5468 Py_XDECREF(v);
5469 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005470}
5471
Alexander Belopolsky40018472011-02-26 01:02:56 +00005472PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005473PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5474 Py_ssize_t size,
5475 const char *errors,
5476 int byteorder)
5477{
5478 PyObject *result;
5479 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5480 if (tmp == NULL)
5481 return NULL;
5482 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5483 Py_DECREF(tmp);
5484 return result;
5485}
5486
5487PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005488PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005489{
Victor Stinnerb960b342011-11-20 19:12:52 +01005490 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005491}
5492
Guido van Rossumd57fd912000-03-10 22:53:23 +00005493/* --- UTF-16 Codec ------------------------------------------------------- */
5494
Tim Peters772747b2001-08-09 22:21:55 +00005495PyObject *
5496PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005497 Py_ssize_t size,
5498 const char *errors,
5499 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005500{
Walter Dörwald69652032004-09-07 20:24:22 +00005501 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5502}
5503
5504PyObject *
5505PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005506 Py_ssize_t size,
5507 const char *errors,
5508 int *byteorder,
5509 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005510{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005511 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005512 Py_ssize_t startinpos;
5513 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005514 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005515 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005516 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005517 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005518 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005519 PyObject *errorHandler = NULL;
5520 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005521 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005522
Tim Peters772747b2001-08-09 22:21:55 +00005523 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005524 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005525
5526 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005527 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005528
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005529 /* Check for BOM marks (U+FEFF) in the input and adjust current
5530 byte order setting accordingly. In native mode, the leading BOM
5531 mark is skipped, in all other modes, it is copied to the output
5532 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005533 if (bo == 0 && size >= 2) {
5534 const Py_UCS4 bom = (q[1] << 8) | q[0];
5535 if (bom == 0xFEFF) {
5536 q += 2;
5537 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005538 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005539 else if (bom == 0xFFFE) {
5540 q += 2;
5541 bo = 1;
5542 }
5543 if (byteorder)
5544 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005545 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005546
Antoine Pitrou63065d72012-05-15 23:48:04 +02005547 if (q == e) {
5548 if (consumed)
5549 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005550 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005551 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005552
Christian Heimes743e0cd2012-10-17 23:52:17 +02005553#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005554 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005555 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005556#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005557 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005558 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005559#endif
Tim Peters772747b2001-08-09 22:21:55 +00005560
Antoine Pitrou63065d72012-05-15 23:48:04 +02005561 /* Note: size will always be longer than the resulting Unicode
5562 character count */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005563 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005564 writer.min_length = (e - q + 1) / 2;
5565 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005566 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005567
Antoine Pitrou63065d72012-05-15 23:48:04 +02005568 while (1) {
5569 Py_UCS4 ch = 0;
5570 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005571 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005572 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005573 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005574 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005575 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005576 native_ordering);
5577 else
5578 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005579 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005580 native_ordering);
5581 } else if (kind == PyUnicode_2BYTE_KIND) {
5582 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005583 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005584 native_ordering);
5585 } else {
5586 assert(kind == PyUnicode_4BYTE_KIND);
5587 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005588 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005589 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005590 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005591 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005592
Antoine Pitrou63065d72012-05-15 23:48:04 +02005593 switch (ch)
5594 {
5595 case 0:
5596 /* remaining byte at the end? (size should be even) */
5597 if (q == e || consumed)
5598 goto End;
5599 errmsg = "truncated data";
5600 startinpos = ((const char *)q) - starts;
5601 endinpos = ((const char *)e) - starts;
5602 break;
5603 /* The remaining input chars are ignored if the callback
5604 chooses to skip the input */
5605 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005606 q -= 2;
5607 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005608 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005609 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005610 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005611 endinpos = ((const char *)e) - starts;
5612 break;
5613 case 2:
5614 errmsg = "illegal encoding";
5615 startinpos = ((const char *)q) - 2 - starts;
5616 endinpos = startinpos + 2;
5617 break;
5618 case 3:
5619 errmsg = "illegal UTF-16 surrogate";
5620 startinpos = ((const char *)q) - 4 - starts;
5621 endinpos = startinpos + 2;
5622 break;
5623 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005624 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005625 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005626 continue;
5627 }
5628
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005629 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005630 errors,
5631 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005632 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00005633 &starts,
5634 (const char **)&e,
5635 &startinpos,
5636 &endinpos,
5637 &exc,
5638 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005639 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005640 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005641 }
5642
Antoine Pitrou63065d72012-05-15 23:48:04 +02005643End:
Walter Dörwald69652032004-09-07 20:24:22 +00005644 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005645 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005646
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005647 Py_XDECREF(errorHandler);
5648 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005649 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005650
Benjamin Peterson29060642009-01-31 22:14:21 +00005651 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005652 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005653 Py_XDECREF(errorHandler);
5654 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005655 return NULL;
5656}
5657
Tim Peters772747b2001-08-09 22:21:55 +00005658PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005659_PyUnicode_EncodeUTF16(PyObject *str,
5660 const char *errors,
5661 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005662{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005663 enum PyUnicode_Kind kind;
5664 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005665 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005666 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005667 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005668 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005669#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005670 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005671#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005672 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005673#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005674 const char *encoding;
5675 Py_ssize_t nsize, pos;
5676 PyObject *errorHandler = NULL;
5677 PyObject *exc = NULL;
5678 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00005679
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005680 if (!PyUnicode_Check(str)) {
5681 PyErr_BadArgument();
5682 return NULL;
5683 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005684 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005685 return NULL;
5686 kind = PyUnicode_KIND(str);
5687 data = PyUnicode_DATA(str);
5688 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005689
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005690 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005691 if (kind == PyUnicode_4BYTE_KIND) {
5692 const Py_UCS4 *in = (const Py_UCS4 *)data;
5693 const Py_UCS4 *end = in + len;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005694 while (in < end) {
5695 if (*in++ >= 0x10000) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005696 pairs++;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005697 }
5698 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005699 }
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005700 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005701 return PyErr_NoMemory();
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005702 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005703 nsize = len + pairs + (byteorder == 0);
5704 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005705 if (v == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005706 return NULL;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005707 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005708
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005709 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005710 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005711 out = (unsigned short *)PyBytes_AS_STRING(v);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005712 if (byteorder == 0) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005713 *out++ = 0xFEFF;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005714 }
5715 if (len == 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00005716 goto done;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005717 }
Tim Peters772747b2001-08-09 22:21:55 +00005718
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005719 if (kind == PyUnicode_1BYTE_KIND) {
5720 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5721 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005722 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005723
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005724 if (byteorder < 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005725 encoding = "utf-16-le";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005726 }
5727 else if (byteorder > 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005728 encoding = "utf-16-be";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005729 }
5730 else {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005731 encoding = "utf-16";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005732 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005733
5734 pos = 0;
5735 while (pos < len) {
5736 Py_ssize_t repsize, moreunits;
5737
5738 if (kind == PyUnicode_2BYTE_KIND) {
5739 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
5740 &out, native_ordering);
5741 }
5742 else {
5743 assert(kind == PyUnicode_4BYTE_KIND);
5744 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
5745 &out, native_ordering);
5746 }
5747 if (pos == len)
5748 break;
5749
5750 rep = unicode_encode_call_errorhandler(
5751 errors, &errorHandler,
5752 encoding, "surrogates not allowed",
5753 str, &exc, pos, pos + 1, &pos);
5754 if (!rep)
5755 goto error;
5756
5757 if (PyBytes_Check(rep)) {
5758 repsize = PyBytes_GET_SIZE(rep);
5759 if (repsize & 1) {
5760 raise_encode_exception(&exc, encoding,
5761 str, pos - 1, pos,
5762 "surrogates not allowed");
5763 goto error;
5764 }
5765 moreunits = repsize / 2;
5766 }
5767 else {
5768 assert(PyUnicode_Check(rep));
5769 if (PyUnicode_READY(rep) < 0)
5770 goto error;
5771 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5772 if (!PyUnicode_IS_ASCII(rep)) {
5773 raise_encode_exception(&exc, encoding,
5774 str, pos - 1, pos,
5775 "surrogates not allowed");
5776 goto error;
5777 }
5778 }
5779
5780 /* two bytes are reserved for each surrogate */
5781 if (moreunits > 1) {
5782 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
5783 Py_ssize_t morebytes = 2 * (moreunits - 1);
5784 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5785 /* integer overflow */
5786 PyErr_NoMemory();
5787 goto error;
5788 }
5789 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5790 goto error;
5791 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
5792 }
5793
5794 if (PyBytes_Check(rep)) {
5795 Py_MEMCPY(out, PyBytes_AS_STRING(rep), repsize);
5796 out += moreunits;
5797 } else /* rep is unicode */ {
5798 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5799 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5800 &out, native_ordering);
5801 }
5802
5803 Py_CLEAR(rep);
5804 }
5805
5806 /* Cut back to size actually needed. This is necessary for, for example,
5807 encoding of a string containing isolated surrogates and the 'ignore' handler
5808 is used. */
5809 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5810 if (nsize != PyBytes_GET_SIZE(v))
5811 _PyBytes_Resize(&v, nsize);
5812 Py_XDECREF(errorHandler);
5813 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00005814 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005815 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005816 error:
5817 Py_XDECREF(rep);
5818 Py_XDECREF(errorHandler);
5819 Py_XDECREF(exc);
5820 Py_XDECREF(v);
5821 return NULL;
5822#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005823}
5824
Alexander Belopolsky40018472011-02-26 01:02:56 +00005825PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005826PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5827 Py_ssize_t size,
5828 const char *errors,
5829 int byteorder)
5830{
5831 PyObject *result;
5832 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5833 if (tmp == NULL)
5834 return NULL;
5835 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5836 Py_DECREF(tmp);
5837 return result;
5838}
5839
5840PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005841PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005842{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005843 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005844}
5845
5846/* --- Unicode Escape Codec ----------------------------------------------- */
5847
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005848/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
5849 if all the escapes in the string make it still a valid ASCII string.
5850 Returns -1 if any escapes were found which cause the string to
5851 pop out of ASCII range. Otherwise returns the length of the
5852 required buffer to hold the string.
5853 */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02005854static Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005855length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
5856{
5857 const unsigned char *p = (const unsigned char *)s;
5858 const unsigned char *end = p + size;
5859 Py_ssize_t length = 0;
5860
5861 if (size < 0)
5862 return -1;
5863
5864 for (; p < end; ++p) {
5865 if (*p > 127) {
5866 /* Non-ASCII */
5867 return -1;
5868 }
5869 else if (*p != '\\') {
5870 /* Normal character */
5871 ++length;
5872 }
5873 else {
5874 /* Backslash-escape, check next char */
5875 ++p;
5876 /* Escape sequence reaches till end of string or
5877 non-ASCII follow-up. */
5878 if (p >= end || *p > 127)
5879 return -1;
5880 switch (*p) {
5881 case '\n':
5882 /* backslash + \n result in zero characters */
5883 break;
5884 case '\\': case '\'': case '\"':
5885 case 'b': case 'f': case 't':
5886 case 'n': case 'r': case 'v': case 'a':
5887 ++length;
5888 break;
5889 case '0': case '1': case '2': case '3':
5890 case '4': case '5': case '6': case '7':
5891 case 'x': case 'u': case 'U': case 'N':
5892 /* these do not guarantee ASCII characters */
5893 return -1;
5894 default:
5895 /* count the backslash + the other character */
5896 length += 2;
5897 }
5898 }
5899 }
5900 return length;
5901}
5902
Fredrik Lundh06d12682001-01-24 07:59:11 +00005903static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005904
Alexander Belopolsky40018472011-02-26 01:02:56 +00005905PyObject *
5906PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005907 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005908 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005909{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005910 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005911 Py_ssize_t startinpos;
5912 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005913 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005914 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005915 char* message;
5916 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005917 PyObject *errorHandler = NULL;
5918 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005919 Py_ssize_t len;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005920
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005921 len = length_of_escaped_ascii_string(s, size);
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005922 if (len == 0)
5923 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005924
5925 /* After length_of_escaped_ascii_string() there are two alternatives,
5926 either the string is pure ASCII with named escapes like \n, etc.
5927 and we determined it's exact size (common case)
5928 or it contains \x, \u, ... escape sequences. then we create a
5929 legacy wchar string and resize it at the end of this function. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005930 _PyUnicodeWriter_Init(&writer);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005931 if (len > 0) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02005932 writer.min_length = len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005933 }
5934 else {
5935 /* Escaped strings will always be longer than the resulting
5936 Unicode string, so we start with size here and then reduce the
5937 length after conversion to the true value.
5938 (but if the error callback returns a long replacement string
5939 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005940 writer.min_length = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005941 }
5942
Guido van Rossumd57fd912000-03-10 22:53:23 +00005943 if (size == 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005944 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005945 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005946
Guido van Rossumd57fd912000-03-10 22:53:23 +00005947 while (s < end) {
5948 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005949 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005950 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005951
5952 /* Non-escape characters are interpreted as Unicode ordinals */
5953 if (*s != '\\') {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005954 x = (unsigned char)*s;
5955 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005956 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005957 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005958 continue;
5959 }
5960
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005961 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005962 /* \ - Escapes */
5963 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005964 c = *s++;
5965 if (s > end)
5966 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005967
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005968 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005969
Benjamin Peterson29060642009-01-31 22:14:21 +00005970 /* \x escapes */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005971#define WRITECHAR(ch) \
5972 do { \
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005973 if (_PyUnicodeWriter_WriteCharInline(&writer, (ch)) < 0) \
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005974 goto onError; \
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005975 } while(0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005976
Guido van Rossumd57fd912000-03-10 22:53:23 +00005977 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005978 case '\\': WRITECHAR('\\'); break;
5979 case '\'': WRITECHAR('\''); break;
5980 case '\"': WRITECHAR('\"'); break;
5981 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005982 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005983 case 'f': WRITECHAR('\014'); break;
5984 case 't': WRITECHAR('\t'); break;
5985 case 'n': WRITECHAR('\n'); break;
5986 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005987 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005988 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005989 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005990 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005991
Benjamin Peterson29060642009-01-31 22:14:21 +00005992 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005993 case '0': case '1': case '2': case '3':
5994 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005995 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005996 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005997 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005998 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005999 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006000 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006001 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006002 break;
6003
Benjamin Peterson29060642009-01-31 22:14:21 +00006004 /* hex escapes */
6005 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006006 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006007 digits = 2;
6008 message = "truncated \\xXX escape";
6009 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006010
Benjamin Peterson29060642009-01-31 22:14:21 +00006011 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006012 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006013 digits = 4;
6014 message = "truncated \\uXXXX escape";
6015 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006016
Benjamin Peterson29060642009-01-31 22:14:21 +00006017 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00006018 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006019 digits = 8;
6020 message = "truncated \\UXXXXXXXX escape";
6021 hexescape:
6022 chr = 0;
Serhiy Storchakad6793772013-01-29 10:20:44 +02006023 if (end - s < digits) {
6024 /* count only hex digits */
6025 for (; s < end; ++s) {
6026 c = (unsigned char)*s;
6027 if (!Py_ISXDIGIT(c))
6028 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00006029 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006030 goto error;
6031 }
6032 for (; digits--; ++s) {
6033 c = (unsigned char)*s;
6034 if (!Py_ISXDIGIT(c))
6035 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00006036 chr = (chr<<4) & ~0xF;
6037 if (c >= '0' && c <= '9')
6038 chr += c - '0';
6039 else if (c >= 'a' && c <= 'f')
6040 chr += 10 + c - 'a';
6041 else
6042 chr += 10 + c - 'A';
6043 }
Jeremy Hylton504de6b2003-10-06 05:08:26 +00006044 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006045 /* _decoding_error will have already written into the
6046 target buffer. */
6047 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006048 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00006049 /* when we get here, chr is a 32-bit unicode character */
Serhiy Storchaka24193de2013-01-29 10:28:07 +02006050 message = "illegal Unicode character";
6051 if (chr > MAX_UNICODE)
Serhiy Storchakad6793772013-01-29 10:20:44 +02006052 goto error;
Serhiy Storchaka24193de2013-01-29 10:28:07 +02006053 WRITECHAR(chr);
Fredrik Lundhccc74732001-02-18 22:13:49 +00006054 break;
6055
Benjamin Peterson29060642009-01-31 22:14:21 +00006056 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006057 case 'N':
6058 message = "malformed \\N character escape";
6059 if (ucnhash_CAPI == NULL) {
6060 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006061 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6062 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00006063 if (ucnhash_CAPI == NULL)
6064 goto ucnhashError;
6065 }
6066 if (*s == '{') {
6067 const char *start = s+1;
6068 /* look for the closing brace */
6069 while (*s != '}' && s < end)
6070 s++;
6071 if (s > start && s < end && *s == '}') {
6072 /* found a name. look it up in the unicode database */
6073 message = "unknown Unicode character name";
6074 s++;
Serhiy Storchaka4f5f0e52013-01-21 11:38:00 +02006075 if (s - start - 1 <= INT_MAX &&
Serhiy Storchakac35f3a92013-01-21 11:42:57 +02006076 ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03006077 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00006078 goto store;
6079 }
6080 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006081 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006082
6083 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00006084 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006085 message = "\\ at end of string";
6086 s--;
Serhiy Storchakad6793772013-01-29 10:20:44 +02006087 goto error;
Walter Dörwald8c077222002-03-25 11:16:18 +00006088 }
6089 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006090 WRITECHAR('\\');
Serhiy Storchaka73e38802013-01-25 23:52:21 +02006091 WRITECHAR((unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00006092 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006093 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006094 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006095 continue;
6096
6097 error:
6098 endinpos = s-starts;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006099 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02006100 errors, &errorHandler,
6101 "unicodeescape", message,
6102 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006103 &writer))
Serhiy Storchakad6793772013-01-29 10:20:44 +02006104 goto onError;
6105 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006106 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006107#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006108
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006109 Py_XDECREF(errorHandler);
6110 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006111 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00006112
Benjamin Peterson29060642009-01-31 22:14:21 +00006113 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00006114 PyErr_SetString(
6115 PyExc_UnicodeError,
6116 "\\N escapes not supported (can't load unicodedata module)"
6117 );
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006118 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006119 Py_XDECREF(errorHandler);
6120 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00006121 return NULL;
6122
Benjamin Peterson29060642009-01-31 22:14:21 +00006123 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006124 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006125 Py_XDECREF(errorHandler);
6126 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006127 return NULL;
6128}
6129
6130/* Return a Unicode-Escape string version of the Unicode object.
6131
6132 If quotes is true, the string is enclosed in u"" or u'' quotes as
6133 appropriate.
6134
6135*/
6136
Alexander Belopolsky40018472011-02-26 01:02:56 +00006137PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006138PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006139{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006140 Py_ssize_t i, len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006141 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006142 int kind;
6143 void *data;
Victor Stinner358af132015-10-12 22:36:57 +02006144 _PyBytesWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006145
Ezio Melottie7f90372012-10-05 03:33:31 +03006146 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00006147 escape.
6148
Ezio Melottie7f90372012-10-05 03:33:31 +03006149 For UCS1 strings it's '\xxx', 4 bytes per source character.
6150 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6151 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00006152 */
6153
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006154 if (!PyUnicode_Check(unicode)) {
6155 PyErr_BadArgument();
6156 return NULL;
6157 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006158 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006159 return NULL;
Victor Stinner358af132015-10-12 22:36:57 +02006160
6161 _PyBytesWriter_Init(&writer);
6162
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006163 len = PyUnicode_GET_LENGTH(unicode);
6164 kind = PyUnicode_KIND(unicode);
6165 data = PyUnicode_DATA(unicode);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006166
Victor Stinner358af132015-10-12 22:36:57 +02006167 p = _PyBytesWriter_Alloc(&writer, len);
6168 if (p == NULL)
6169 goto error;
6170 writer.overallocate = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006171
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006172 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006173 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006174
Walter Dörwald79e913e2007-05-12 11:08:06 +00006175 /* Escape backslashes */
6176 if (ch == '\\') {
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006177 /* -1: subtract 1 preallocated byte */
Victor Stinner358af132015-10-12 22:36:57 +02006178 p = _PyBytesWriter_Prepare(&writer, p, 2-1);
6179 if (p == NULL)
6180 goto error;
6181
Guido van Rossumd57fd912000-03-10 22:53:23 +00006182 *p++ = '\\';
6183 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00006184 continue;
Tim Petersced69f82003-09-16 20:30:58 +00006185 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006186
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006187 /* Map 21-bit characters to '\U00xxxxxx' */
6188 else if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006189 assert(ch <= MAX_UNICODE);
Victor Stinner358af132015-10-12 22:36:57 +02006190
6191 p = _PyBytesWriter_Prepare(&writer, p, 10-1);
6192 if (p == NULL)
6193 goto error;
6194
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006195 *p++ = '\\';
6196 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006197 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
6198 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
6199 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6200 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6201 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6202 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6203 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6204 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00006205 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006206 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00006207
Guido van Rossumd57fd912000-03-10 22:53:23 +00006208 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00006209 if (ch >= 256) {
Victor Stinner358af132015-10-12 22:36:57 +02006210 p = _PyBytesWriter_Prepare(&writer, p, 6-1);
6211 if (p == NULL)
6212 goto error;
6213
Guido van Rossumd57fd912000-03-10 22:53:23 +00006214 *p++ = '\\';
6215 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006216 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6217 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6218 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6219 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006220 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006221
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006222 /* Map special whitespace to '\t', \n', '\r' */
6223 else if (ch == '\t') {
Victor Stinner358af132015-10-12 22:36:57 +02006224 p = _PyBytesWriter_Prepare(&writer, p, 2-1);
6225 if (p == NULL)
6226 goto error;
6227
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006228 *p++ = '\\';
6229 *p++ = 't';
6230 }
6231 else if (ch == '\n') {
Victor Stinner358af132015-10-12 22:36:57 +02006232 p = _PyBytesWriter_Prepare(&writer, p, 2-1);
6233 if (p == NULL)
6234 goto error;
6235
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006236 *p++ = '\\';
6237 *p++ = 'n';
6238 }
6239 else if (ch == '\r') {
Victor Stinner358af132015-10-12 22:36:57 +02006240 p = _PyBytesWriter_Prepare(&writer, p, 2-1);
6241 if (p == NULL)
6242 goto error;
6243
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006244 *p++ = '\\';
6245 *p++ = 'r';
6246 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006247
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006248 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00006249 else if (ch < ' ' || ch >= 0x7F) {
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006250 /* -1: subtract 1 preallocated byte */
Victor Stinner358af132015-10-12 22:36:57 +02006251 p = _PyBytesWriter_Prepare(&writer, p, 4-1);
6252 if (p == NULL)
6253 goto error;
6254
Guido van Rossumd57fd912000-03-10 22:53:23 +00006255 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006256 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006257 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6258 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00006259 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006260
Guido van Rossumd57fd912000-03-10 22:53:23 +00006261 /* Copy everything else as-is */
6262 else
6263 *p++ = (char) ch;
6264 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006265
Victor Stinner358af132015-10-12 22:36:57 +02006266 return _PyBytesWriter_Finish(&writer, p);
6267
6268error:
6269 _PyBytesWriter_Dealloc(&writer);
6270 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006271}
6272
Alexander Belopolsky40018472011-02-26 01:02:56 +00006273PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006274PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6275 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006276{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006277 PyObject *result;
6278 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6279 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006280 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006281 result = PyUnicode_AsUnicodeEscapeString(tmp);
6282 Py_DECREF(tmp);
6283 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006284}
6285
6286/* --- Raw Unicode Escape Codec ------------------------------------------- */
6287
Alexander Belopolsky40018472011-02-26 01:02:56 +00006288PyObject *
6289PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006290 Py_ssize_t size,
6291 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006292{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006293 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006294 Py_ssize_t startinpos;
6295 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006296 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006297 const char *end;
6298 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006299 PyObject *errorHandler = NULL;
6300 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006301
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006302 if (size == 0)
6303 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006304
Guido van Rossumd57fd912000-03-10 22:53:23 +00006305 /* Escaped strings will always be longer than the resulting
6306 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006307 length after conversion to the true value. (But decoding error
6308 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006309 _PyUnicodeWriter_Init(&writer);
6310 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006311
Guido van Rossumd57fd912000-03-10 22:53:23 +00006312 end = s + size;
6313 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006314 unsigned char c;
6315 Py_UCS4 x;
6316 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006317 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006318
Benjamin Peterson29060642009-01-31 22:14:21 +00006319 /* Non-escape characters are interpreted as Unicode ordinals */
6320 if (*s != '\\') {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006321 x = (unsigned char)*s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006322 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006323 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006324 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006325 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006326 startinpos = s-starts;
6327
6328 /* \u-escapes are only interpreted iff the number of leading
6329 backslashes if odd */
6330 bs = s;
6331 for (;s < end;) {
6332 if (*s != '\\')
6333 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006334 x = (unsigned char)*s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006335 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006336 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006337 }
6338 if (((s - bs) & 1) == 0 ||
6339 s >= end ||
6340 (*s != 'u' && *s != 'U')) {
6341 continue;
6342 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006343 writer.pos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00006344 count = *s=='u' ? 4 : 8;
6345 s++;
6346
6347 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00006348 for (x = 0, i = 0; i < count; ++i, ++s) {
6349 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00006350 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006351 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006352 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00006353 errors, &errorHandler,
6354 "rawunicodeescape", "truncated \\uXXXX",
6355 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006356 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006357 goto onError;
6358 goto nextByte;
6359 }
6360 x = (x<<4) & ~0xF;
6361 if (c >= '0' && c <= '9')
6362 x += c - '0';
6363 else if (c >= 'a' && c <= 'f')
6364 x += 10 + c - 'a';
6365 else
6366 x += 10 + c - 'A';
6367 }
Victor Stinner8faf8212011-12-08 22:14:11 +01006368 if (x <= MAX_UNICODE) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006369 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006370 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006371 }
6372 else {
Christian Heimesfe337bf2008-03-23 21:54:12 +00006373 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006374 if (unicode_decode_call_errorhandler_writer(
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006375 errors, &errorHandler,
6376 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006377 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006378 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006379 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006380 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006381 nextByte:
6382 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006383 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006384 Py_XDECREF(errorHandler);
6385 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006386 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006387
Benjamin Peterson29060642009-01-31 22:14:21 +00006388 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006389 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006390 Py_XDECREF(errorHandler);
6391 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006392 return NULL;
6393}
6394
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006395
Alexander Belopolsky40018472011-02-26 01:02:56 +00006396PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006397PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006398{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006399 char *p;
Victor Stinner358af132015-10-12 22:36:57 +02006400 Py_ssize_t pos;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006401 int kind;
6402 void *data;
6403 Py_ssize_t len;
Victor Stinner358af132015-10-12 22:36:57 +02006404 _PyBytesWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006405
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006406 if (!PyUnicode_Check(unicode)) {
6407 PyErr_BadArgument();
6408 return NULL;
6409 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006410 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006411 return NULL;
Victor Stinner358af132015-10-12 22:36:57 +02006412
6413 _PyBytesWriter_Init(&writer);
6414
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006415 kind = PyUnicode_KIND(unicode);
6416 data = PyUnicode_DATA(unicode);
6417 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner0e368262011-11-10 20:12:49 +01006418
Victor Stinner358af132015-10-12 22:36:57 +02006419 p = _PyBytesWriter_Alloc(&writer, len);
6420 if (p == NULL)
6421 goto error;
6422 writer.overallocate = 1;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006423
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006424 for (pos = 0; pos < len; pos++) {
6425 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006426 /* Map 32-bit characters to '\Uxxxxxxxx' */
6427 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006428 assert(ch <= MAX_UNICODE);
Victor Stinner358af132015-10-12 22:36:57 +02006429
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006430 /* -1: subtract 1 preallocated byte */
Victor Stinner358af132015-10-12 22:36:57 +02006431 p = _PyBytesWriter_Prepare(&writer, p, 10-1);
6432 if (p == NULL)
6433 goto error;
6434
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006435 *p++ = '\\';
6436 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006437 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6438 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6439 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6440 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6441 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6442 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6443 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6444 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006445 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006446 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006447 else if (ch >= 256) {
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006448 /* -1: subtract 1 preallocated byte */
Victor Stinner358af132015-10-12 22:36:57 +02006449 p = _PyBytesWriter_Prepare(&writer, p, 6-1);
6450 if (p == NULL)
6451 goto error;
6452
Guido van Rossumd57fd912000-03-10 22:53:23 +00006453 *p++ = '\\';
6454 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006455 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6456 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6457 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6458 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006459 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006460 /* Copy everything else as-is */
6461 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006462 *p++ = (char) ch;
6463 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006464
Victor Stinner358af132015-10-12 22:36:57 +02006465 return _PyBytesWriter_Finish(&writer, p);
6466
6467error:
6468 _PyBytesWriter_Dealloc(&writer);
6469 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006470}
6471
Alexander Belopolsky40018472011-02-26 01:02:56 +00006472PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006473PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6474 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006475{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006476 PyObject *result;
6477 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6478 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006479 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006480 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6481 Py_DECREF(tmp);
6482 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006483}
6484
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006485/* --- Unicode Internal Codec ------------------------------------------- */
6486
Alexander Belopolsky40018472011-02-26 01:02:56 +00006487PyObject *
6488_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006489 Py_ssize_t size,
6490 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006491{
6492 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006493 Py_ssize_t startinpos;
6494 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006495 _PyUnicodeWriter writer;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006496 const char *end;
6497 const char *reason;
6498 PyObject *errorHandler = NULL;
6499 PyObject *exc = NULL;
6500
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006501 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006502 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006503 1))
6504 return NULL;
6505
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006506 if (size == 0)
6507 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006508
Victor Stinner8f674cc2013-04-17 23:02:17 +02006509 _PyUnicodeWriter_Init(&writer);
6510 if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) {
6511 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00006512 goto onError;
Victor Stinner8f674cc2013-04-17 23:02:17 +02006513 }
6514 writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006515
Victor Stinner8f674cc2013-04-17 23:02:17 +02006516 end = s + size;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006517 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006518 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006519 Py_UCS4 ch;
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006520 if (end - s < Py_UNICODE_SIZE) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006521 endinpos = end-starts;
6522 reason = "truncated input";
6523 goto error;
6524 }
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006525 /* We copy the raw representation one byte at a time because the
6526 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006527 ((char *) &uch)[0] = s[0];
6528 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006529#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006530 ((char *) &uch)[2] = s[2];
6531 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006532#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006533 ch = uch;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006534#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006535 /* We have to sanity check the raw data, otherwise doom looms for
6536 some malformed UCS-4 data. */
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006537 if (ch > 0x10ffff) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006538 endinpos = s - starts + Py_UNICODE_SIZE;
6539 reason = "illegal code point (> 0x10FFFF)";
6540 goto error;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006541 }
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006542#endif
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006543 s += Py_UNICODE_SIZE;
6544#ifndef Py_UNICODE_WIDE
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006545 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006546 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006547 Py_UNICODE uch2;
6548 ((char *) &uch2)[0] = s[0];
6549 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006550 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006551 {
Victor Stinner551ac952011-11-29 22:58:13 +01006552 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006553 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006554 }
6555 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006556#endif
6557
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006558 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006559 goto onError;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006560 continue;
6561
6562 error:
6563 startinpos = s - starts;
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006564 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006565 errors, &errorHandler,
6566 "unicode_internal", reason,
6567 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006568 &writer))
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006569 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006570 }
6571
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006572 Py_XDECREF(errorHandler);
6573 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006574 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006575
Benjamin Peterson29060642009-01-31 22:14:21 +00006576 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006577 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006578 Py_XDECREF(errorHandler);
6579 Py_XDECREF(exc);
6580 return NULL;
6581}
6582
Guido van Rossumd57fd912000-03-10 22:53:23 +00006583/* --- Latin-1 Codec ------------------------------------------------------ */
6584
Alexander Belopolsky40018472011-02-26 01:02:56 +00006585PyObject *
6586PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006587 Py_ssize_t size,
6588 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006589{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006590 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006591 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006592}
6593
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006594/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006595static void
6596make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006597 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006598 PyObject *unicode,
6599 Py_ssize_t startpos, Py_ssize_t endpos,
6600 const char *reason)
6601{
6602 if (*exceptionObject == NULL) {
6603 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006604 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006605 encoding, unicode, startpos, endpos, reason);
6606 }
6607 else {
6608 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6609 goto onError;
6610 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6611 goto onError;
6612 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6613 goto onError;
6614 return;
6615 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006616 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006617 }
6618}
6619
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006620/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006621static void
6622raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006623 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006624 PyObject *unicode,
6625 Py_ssize_t startpos, Py_ssize_t endpos,
6626 const char *reason)
6627{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006628 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006629 encoding, unicode, startpos, endpos, reason);
6630 if (*exceptionObject != NULL)
6631 PyCodec_StrictErrors(*exceptionObject);
6632}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006633
6634/* error handling callback helper:
6635 build arguments, call the callback and check the arguments,
6636 put the result into newpos and return the replacement string, which
6637 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006638static PyObject *
6639unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006640 PyObject **errorHandler,
6641 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006642 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006643 Py_ssize_t startpos, Py_ssize_t endpos,
6644 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006645{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02006646 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006647 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006648 PyObject *restuple;
6649 PyObject *resunicode;
6650
6651 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006652 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006653 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006654 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006655 }
6656
Benjamin Petersonbac79492012-01-14 13:34:47 -05006657 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006658 return NULL;
6659 len = PyUnicode_GET_LENGTH(unicode);
6660
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006661 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006662 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006663 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006664 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006665
6666 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006667 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006668 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006669 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006670 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006671 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006672 Py_DECREF(restuple);
6673 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006674 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006675 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006676 &resunicode, newpos)) {
6677 Py_DECREF(restuple);
6678 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006679 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006680 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6681 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6682 Py_DECREF(restuple);
6683 return NULL;
6684 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006685 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006686 *newpos = len + *newpos;
6687 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02006688 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006689 Py_DECREF(restuple);
6690 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006691 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006692 Py_INCREF(resunicode);
6693 Py_DECREF(restuple);
6694 return resunicode;
6695}
6696
Alexander Belopolsky40018472011-02-26 01:02:56 +00006697static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006698unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006699 const char *errors,
Victor Stinner0030cd52015-09-24 14:45:00 +02006700 const Py_UCS4 limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006701{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006702 /* input state */
6703 Py_ssize_t pos=0, size;
6704 int kind;
6705 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006706 /* pointer into the output */
6707 char *str;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006708 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6709 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Victor Stinner50149202015-09-22 00:26:54 +02006710 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006711 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02006712 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006713 PyObject *rep = NULL;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006714 /* output object */
6715 _PyBytesWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006716
Benjamin Petersonbac79492012-01-14 13:34:47 -05006717 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006718 return NULL;
6719 size = PyUnicode_GET_LENGTH(unicode);
6720 kind = PyUnicode_KIND(unicode);
6721 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006722 /* allocate enough for a simple encoding without
6723 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006724 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006725 return PyBytes_FromStringAndSize(NULL, 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006726
6727 _PyBytesWriter_Init(&writer);
6728 str = _PyBytesWriter_Alloc(&writer, size);
6729 if (str == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006730 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006731
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006732 while (pos < size) {
Victor Stinner0030cd52015-09-24 14:45:00 +02006733 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006734
Benjamin Peterson29060642009-01-31 22:14:21 +00006735 /* can we encode this? */
Victor Stinner0030cd52015-09-24 14:45:00 +02006736 if (ch < limit) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006737 /* no overflow check, because we know that the space is enough */
Victor Stinner0030cd52015-09-24 14:45:00 +02006738 *str++ = (char)ch;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006739 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006740 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006741 else {
Victor Stinner6bd525b2015-10-09 13:10:05 +02006742 Py_ssize_t newpos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006743 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006744 Py_ssize_t collstart = pos;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006745 Py_ssize_t collend = collstart + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006746 /* find all unecodable characters */
Victor Stinner50149202015-09-22 00:26:54 +02006747
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006748 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006749 ++collend;
Victor Stinner50149202015-09-22 00:26:54 +02006750
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006751 /* Only overallocate the buffer if it's not the last write */
6752 writer.overallocate = (collend < size);
6753
Benjamin Peterson29060642009-01-31 22:14:21 +00006754 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02006755 if (error_handler == _Py_ERROR_UNKNOWN)
6756 error_handler = get_error_handler(errors);
6757
6758 switch (error_handler) {
6759 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006760 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006761 goto onError;
Victor Stinner50149202015-09-22 00:26:54 +02006762
6763 case _Py_ERROR_REPLACE:
Victor Stinner01ada392015-10-01 21:54:51 +02006764 memset(str, '?', collend - collstart);
6765 str += (collend - collstart);
Victor Stinner0030cd52015-09-24 14:45:00 +02006766 /* fall through ignore error handler */
Victor Stinner50149202015-09-22 00:26:54 +02006767 case _Py_ERROR_IGNORE:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006768 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006769 break;
Victor Stinner50149202015-09-22 00:26:54 +02006770
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006771 case _Py_ERROR_BACKSLASHREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006772 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006773 writer.min_size -= (collend - collstart);
6774 str = backslashreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006775 unicode, collstart, collend);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006776 if (str == NULL)
6777 goto onError;
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006778 pos = collend;
6779 break;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006780
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006781 case _Py_ERROR_XMLCHARREFREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006782 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006783 writer.min_size -= (collend - collstart);
6784 str = xmlcharrefreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006785 unicode, collstart, collend);
6786 if (str == NULL)
6787 goto onError;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006788 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006789 break;
Victor Stinner50149202015-09-22 00:26:54 +02006790
Victor Stinnerc3713e92015-09-29 12:32:13 +02006791 case _Py_ERROR_SURROGATEESCAPE:
6792 for (i = collstart; i < collend; ++i) {
6793 ch = PyUnicode_READ(kind, data, i);
6794 if (ch < 0xdc80 || 0xdcff < ch) {
6795 /* Not a UTF-8b surrogate */
6796 break;
6797 }
6798 *str++ = (char)(ch - 0xdc00);
6799 ++pos;
6800 }
6801 if (i >= collend)
6802 break;
6803 collstart = pos;
6804 assert(collstart != collend);
6805 /* fallback to general error handling */
6806
Benjamin Peterson29060642009-01-31 22:14:21 +00006807 default:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006808 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
6809 encoding, reason, unicode, &exc,
6810 collstart, collend, &newpos);
6811 if (rep == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006812 goto onError;
Victor Stinner0030cd52015-09-24 14:45:00 +02006813
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006814 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006815 writer.min_size -= 1;
6816
Victor Stinner6bd525b2015-10-09 13:10:05 +02006817 if (PyBytes_Check(rep)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006818 /* Directly copy bytes result to output. */
Victor Stinnerce179bf2015-10-09 12:57:22 +02006819 str = _PyBytesWriter_WriteBytes(&writer, str,
Victor Stinner6bd525b2015-10-09 13:10:05 +02006820 PyBytes_AS_STRING(rep),
6821 PyBytes_GET_SIZE(rep));
Victor Stinnerad771582015-10-09 12:38:53 +02006822 if (str == NULL)
6823 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006824 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02006825 else {
6826 assert(PyUnicode_Check(rep));
Victor Stinner0030cd52015-09-24 14:45:00 +02006827
Victor Stinner6bd525b2015-10-09 13:10:05 +02006828 if (PyUnicode_READY(rep) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006829 goto onError;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006830
6831 if (PyUnicode_IS_ASCII(rep)) {
6832 /* Fast path: all characters are smaller than limit */
6833 assert(limit >= 128);
6834 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6835 str = _PyBytesWriter_WriteBytes(&writer, str,
6836 PyUnicode_DATA(rep),
6837 PyUnicode_GET_LENGTH(rep));
Benjamin Peterson29060642009-01-31 22:14:21 +00006838 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02006839 else {
6840 Py_ssize_t repsize = PyUnicode_GET_LENGTH(rep);
6841
6842 str = _PyBytesWriter_Prepare(&writer, str, repsize);
6843 if (str == NULL)
6844 goto onError;
6845
6846 /* check if there is anything unencodable in the
6847 replacement and copy it to the output */
6848 for (i = 0; repsize-->0; ++i, ++str) {
6849 ch = PyUnicode_READ_CHAR(rep, i);
6850 if (ch >= limit) {
6851 raise_encode_exception(&exc, encoding, unicode,
6852 pos, pos+1, reason);
6853 goto onError;
6854 }
6855 *str = (char)ch;
6856 }
6857 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006858 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006859 pos = newpos;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006860 Py_CLEAR(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006861 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006862
6863 /* If overallocation was disabled, ensure that it was the last
6864 write. Otherwise, we missed an optimization */
6865 assert(writer.overallocate || pos == size);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006866 }
6867 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006868
Victor Stinner50149202015-09-22 00:26:54 +02006869 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006870 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006871 return _PyBytesWriter_Finish(&writer, str);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006872
6873 onError:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006874 Py_XDECREF(rep);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006875 _PyBytesWriter_Dealloc(&writer);
Victor Stinner50149202015-09-22 00:26:54 +02006876 Py_XDECREF(error_handler_obj);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006877 Py_XDECREF(exc);
6878 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006879}
6880
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006881/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006882PyObject *
6883PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006884 Py_ssize_t size,
6885 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006886{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006887 PyObject *result;
6888 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6889 if (unicode == NULL)
6890 return NULL;
6891 result = unicode_encode_ucs1(unicode, errors, 256);
6892 Py_DECREF(unicode);
6893 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006894}
6895
Alexander Belopolsky40018472011-02-26 01:02:56 +00006896PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006897_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006898{
6899 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006900 PyErr_BadArgument();
6901 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006902 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006903 if (PyUnicode_READY(unicode) == -1)
6904 return NULL;
6905 /* Fast path: if it is a one-byte string, construct
6906 bytes object directly. */
6907 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6908 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6909 PyUnicode_GET_LENGTH(unicode));
6910 /* Non-Latin-1 characters present. Defer to above function to
6911 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006912 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006913}
6914
6915PyObject*
6916PyUnicode_AsLatin1String(PyObject *unicode)
6917{
6918 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006919}
6920
6921/* --- 7-bit ASCII Codec -------------------------------------------------- */
6922
Alexander Belopolsky40018472011-02-26 01:02:56 +00006923PyObject *
6924PyUnicode_DecodeASCII(const char *s,
6925 Py_ssize_t size,
6926 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006927{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006928 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006929 _PyUnicodeWriter writer;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006930 int kind;
6931 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006932 Py_ssize_t startinpos;
6933 Py_ssize_t endinpos;
6934 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006935 const char *e;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006936 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006937 PyObject *exc = NULL;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006938 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Tim Petersced69f82003-09-16 20:30:58 +00006939
Guido van Rossumd57fd912000-03-10 22:53:23 +00006940 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02006941 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006942
Guido van Rossumd57fd912000-03-10 22:53:23 +00006943 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006944 if (size == 1 && (unsigned char)s[0] < 128)
6945 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006946
Victor Stinner8f674cc2013-04-17 23:02:17 +02006947 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02006948 writer.min_length = size;
6949 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
Victor Stinner8f674cc2013-04-17 23:02:17 +02006950 return NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006951
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006952 e = s + size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006953 data = writer.data;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006954 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006955 writer.pos = outpos;
6956 if (writer.pos == size)
6957 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006958
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006959 s += writer.pos;
6960 kind = writer.kind;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006961 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02006962 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00006963 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006964 PyUnicode_WRITE(kind, data, writer.pos, c);
6965 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00006966 ++s;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006967 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +00006968 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02006969
6970 /* byte outsize range 0x00..0x7f: call the error handler */
6971
6972 if (error_handler == _Py_ERROR_UNKNOWN)
6973 error_handler = get_error_handler(errors);
6974
6975 switch (error_handler)
6976 {
6977 case _Py_ERROR_REPLACE:
6978 case _Py_ERROR_SURROGATEESCAPE:
6979 /* Fast-path: the error handler only writes one character,
Victor Stinnerca9381e2015-09-22 00:58:32 +02006980 but we may switch to UCS2 at the first write */
6981 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
6982 goto onError;
6983 kind = writer.kind;
6984 data = writer.data;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006985
6986 if (error_handler == _Py_ERROR_REPLACE)
6987 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
6988 else
6989 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
6990 writer.pos++;
6991 ++s;
6992 break;
6993
6994 case _Py_ERROR_IGNORE:
6995 ++s;
6996 break;
6997
6998 default:
Benjamin Peterson29060642009-01-31 22:14:21 +00006999 startinpos = s-starts;
7000 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007001 if (unicode_decode_call_errorhandler_writer(
Victor Stinnerf96418d2015-09-21 23:06:27 +02007002 errors, &error_handler_obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007003 "ascii", "ordinal not in range(128)",
7004 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007005 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00007006 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007007 kind = writer.kind;
7008 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00007009 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007010 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007011 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007012 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007013 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007014
Benjamin Peterson29060642009-01-31 22:14:21 +00007015 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007016 _PyUnicodeWriter_Dealloc(&writer);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007017 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007018 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007019 return NULL;
7020}
7021
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007022/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007023PyObject *
7024PyUnicode_EncodeASCII(const Py_UNICODE *p,
7025 Py_ssize_t size,
7026 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007027{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007028 PyObject *result;
7029 PyObject *unicode = PyUnicode_FromUnicode(p, size);
7030 if (unicode == NULL)
7031 return NULL;
7032 result = unicode_encode_ucs1(unicode, errors, 128);
7033 Py_DECREF(unicode);
7034 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007035}
7036
Alexander Belopolsky40018472011-02-26 01:02:56 +00007037PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007038_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007039{
7040 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007041 PyErr_BadArgument();
7042 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007043 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007044 if (PyUnicode_READY(unicode) == -1)
7045 return NULL;
7046 /* Fast path: if it is an ASCII-only string, construct bytes object
7047 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02007048 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007049 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7050 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007051 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007052}
7053
7054PyObject *
7055PyUnicode_AsASCIIString(PyObject *unicode)
7056{
7057 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007058}
7059
Victor Stinner99b95382011-07-04 14:23:54 +02007060#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007061
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007062/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007063
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00007064#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007065#define NEED_RETRY
7066#endif
7067
Victor Stinner3a50e702011-10-18 21:21:00 +02007068#ifndef WC_ERR_INVALID_CHARS
7069# define WC_ERR_INVALID_CHARS 0x0080
7070#endif
7071
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007072static const char*
Victor Stinner3a50e702011-10-18 21:21:00 +02007073code_page_name(UINT code_page, PyObject **obj)
7074{
7075 *obj = NULL;
7076 if (code_page == CP_ACP)
7077 return "mbcs";
7078 if (code_page == CP_UTF7)
7079 return "CP_UTF7";
7080 if (code_page == CP_UTF8)
7081 return "CP_UTF8";
7082
7083 *obj = PyBytes_FromFormat("cp%u", code_page);
7084 if (*obj == NULL)
7085 return NULL;
7086 return PyBytes_AS_STRING(*obj);
7087}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007088
Victor Stinner3a50e702011-10-18 21:21:00 +02007089static DWORD
7090decode_code_page_flags(UINT code_page)
7091{
7092 if (code_page == CP_UTF7) {
7093 /* The CP_UTF7 decoder only supports flags=0 */
7094 return 0;
7095 }
7096 else
7097 return MB_ERR_INVALID_CHARS;
7098}
7099
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007100/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007101 * Decode a byte string from a Windows code page into unicode object in strict
7102 * mode.
7103 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007104 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7105 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007106 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007107static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007108decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007109 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02007110 const char *in,
7111 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007112{
Victor Stinner3a50e702011-10-18 21:21:00 +02007113 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01007114 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007115 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007116
7117 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007118 assert(insize > 0);
7119 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
7120 if (outsize <= 0)
7121 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007122
7123 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007124 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01007125 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007126 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00007127 if (*v == NULL)
7128 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007129 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007130 }
7131 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007132 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007133 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01007134 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007135 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007136 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007137 }
7138
7139 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007140 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7141 if (outsize <= 0)
7142 goto error;
7143 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007144
Victor Stinner3a50e702011-10-18 21:21:00 +02007145error:
7146 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7147 return -2;
7148 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007149 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007150}
7151
Victor Stinner3a50e702011-10-18 21:21:00 +02007152/*
7153 * Decode a byte string from a code page into unicode object with an error
7154 * handler.
7155 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007156 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02007157 * UnicodeDecodeError exception and returns -1 on error.
7158 */
7159static int
7160decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007161 PyObject **v,
7162 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007163 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02007164{
7165 const char *startin = in;
7166 const char *endin = in + size;
7167 const DWORD flags = decode_code_page_flags(code_page);
7168 /* Ideally, we should get reason from FormatMessage. This is the Windows
7169 2000 English version of the message. */
7170 const char *reason = "No mapping for the Unicode character exists "
7171 "in the target code page.";
7172 /* each step cannot decode more than 1 character, but a character can be
7173 represented as a surrogate pair */
7174 wchar_t buffer[2], *startout, *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02007175 int insize;
7176 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007177 PyObject *errorHandler = NULL;
7178 PyObject *exc = NULL;
7179 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007180 const char *encoding;
Victor Stinner3a50e702011-10-18 21:21:00 +02007181 DWORD err;
7182 int ret = -1;
7183
7184 assert(size > 0);
7185
7186 encoding = code_page_name(code_page, &encoding_obj);
7187 if (encoding == NULL)
7188 return -1;
7189
Victor Stinner7d00cc12014-03-17 23:08:06 +01007190 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007191 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7192 UnicodeDecodeError. */
7193 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7194 if (exc != NULL) {
7195 PyCodec_StrictErrors(exc);
7196 Py_CLEAR(exc);
7197 }
7198 goto error;
7199 }
7200
7201 if (*v == NULL) {
7202 /* Create unicode object */
7203 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7204 PyErr_NoMemory();
7205 goto error;
7206 }
Victor Stinnerab595942011-12-17 04:59:06 +01007207 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007208 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02007209 if (*v == NULL)
7210 goto error;
7211 startout = PyUnicode_AS_UNICODE(*v);
7212 }
7213 else {
7214 /* Extend unicode object */
7215 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7216 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7217 PyErr_NoMemory();
7218 goto error;
7219 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007220 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007221 goto error;
7222 startout = PyUnicode_AS_UNICODE(*v) + n;
7223 }
7224
7225 /* Decode the byte string character per character */
7226 out = startout;
7227 while (in < endin)
7228 {
7229 /* Decode a character */
7230 insize = 1;
7231 do
7232 {
7233 outsize = MultiByteToWideChar(code_page, flags,
7234 in, insize,
7235 buffer, Py_ARRAY_LENGTH(buffer));
7236 if (outsize > 0)
7237 break;
7238 err = GetLastError();
7239 if (err != ERROR_NO_UNICODE_TRANSLATION
7240 && err != ERROR_INSUFFICIENT_BUFFER)
7241 {
7242 PyErr_SetFromWindowsErr(0);
7243 goto error;
7244 }
7245 insize++;
7246 }
7247 /* 4=maximum length of a UTF-8 sequence */
7248 while (insize <= 4 && (in + insize) <= endin);
7249
7250 if (outsize <= 0) {
7251 Py_ssize_t startinpos, endinpos, outpos;
7252
Victor Stinner7d00cc12014-03-17 23:08:06 +01007253 /* last character in partial decode? */
7254 if (in + insize >= endin && !final)
7255 break;
7256
Victor Stinner3a50e702011-10-18 21:21:00 +02007257 startinpos = in - startin;
7258 endinpos = startinpos + 1;
7259 outpos = out - PyUnicode_AS_UNICODE(*v);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007260 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007261 errors, &errorHandler,
7262 encoding, reason,
7263 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01007264 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007265 {
7266 goto error;
7267 }
Victor Stinner596a6c42011-11-09 00:02:18 +01007268 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007269 }
7270 else {
7271 in += insize;
7272 memcpy(out, buffer, outsize * sizeof(wchar_t));
7273 out += outsize;
7274 }
7275 }
7276
7277 /* write a NUL character at the end */
7278 *out = 0;
7279
7280 /* Extend unicode object */
7281 outsize = out - startout;
7282 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01007283 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007284 goto error;
Victor Stinnere1f17c62014-07-25 14:03:03 +02007285 /* (in - startin) <= size and size is an int */
7286 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02007287
7288error:
7289 Py_XDECREF(encoding_obj);
7290 Py_XDECREF(errorHandler);
7291 Py_XDECREF(exc);
7292 return ret;
7293}
7294
Victor Stinner3a50e702011-10-18 21:21:00 +02007295static PyObject *
7296decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007297 const char *s, Py_ssize_t size,
7298 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007299{
Victor Stinner76a31a62011-11-04 00:05:13 +01007300 PyObject *v = NULL;
7301 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007302
Victor Stinner3a50e702011-10-18 21:21:00 +02007303 if (code_page < 0) {
7304 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7305 return NULL;
7306 }
7307
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007308 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007309 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007310
Victor Stinner76a31a62011-11-04 00:05:13 +01007311 do
7312 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007313#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007314 if (size > INT_MAX) {
7315 chunk_size = INT_MAX;
7316 final = 0;
7317 done = 0;
7318 }
7319 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007320#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007321 {
7322 chunk_size = (int)size;
7323 final = (consumed == NULL);
7324 done = 1;
7325 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007326
Victor Stinner76a31a62011-11-04 00:05:13 +01007327 if (chunk_size == 0 && done) {
7328 if (v != NULL)
7329 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007330 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007331 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007332
Victor Stinner76a31a62011-11-04 00:05:13 +01007333 converted = decode_code_page_strict(code_page, &v,
7334 s, chunk_size);
7335 if (converted == -2)
7336 converted = decode_code_page_errors(code_page, &v,
7337 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007338 errors, final);
7339 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007340
7341 if (converted < 0) {
7342 Py_XDECREF(v);
7343 return NULL;
7344 }
7345
7346 if (consumed)
7347 *consumed += converted;
7348
7349 s += converted;
7350 size -= converted;
7351 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007352
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007353 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007354}
7355
Alexander Belopolsky40018472011-02-26 01:02:56 +00007356PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007357PyUnicode_DecodeCodePageStateful(int code_page,
7358 const char *s,
7359 Py_ssize_t size,
7360 const char *errors,
7361 Py_ssize_t *consumed)
7362{
7363 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7364}
7365
7366PyObject *
7367PyUnicode_DecodeMBCSStateful(const char *s,
7368 Py_ssize_t size,
7369 const char *errors,
7370 Py_ssize_t *consumed)
7371{
7372 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7373}
7374
7375PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007376PyUnicode_DecodeMBCS(const char *s,
7377 Py_ssize_t size,
7378 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007379{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007380 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7381}
7382
Victor Stinner3a50e702011-10-18 21:21:00 +02007383static DWORD
7384encode_code_page_flags(UINT code_page, const char *errors)
7385{
7386 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007387 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007388 }
7389 else if (code_page == CP_UTF7) {
7390 /* CP_UTF7 only supports flags=0 */
7391 return 0;
7392 }
7393 else {
7394 if (errors != NULL && strcmp(errors, "replace") == 0)
7395 return 0;
7396 else
7397 return WC_NO_BEST_FIT_CHARS;
7398 }
7399}
7400
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007401/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007402 * Encode a Unicode string to a Windows code page into a byte string in strict
7403 * mode.
7404 *
7405 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007406 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007407 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007408static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007409encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007410 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007411 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007412{
Victor Stinner554f3f02010-06-16 23:33:54 +00007413 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007414 BOOL *pusedDefaultChar = &usedDefaultChar;
7415 int outsize;
Victor Stinner24729f32011-11-10 20:31:37 +01007416 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007417 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007418 const DWORD flags = encode_code_page_flags(code_page, NULL);
7419 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007420 /* Create a substring so that we can get the UTF-16 representation
7421 of just the slice under consideration. */
7422 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007423
Martin v. Löwis3d325192011-11-04 18:23:06 +01007424 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007425
Victor Stinner3a50e702011-10-18 21:21:00 +02007426 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007427 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007428 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007429 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007430
Victor Stinner2fc507f2011-11-04 20:06:39 +01007431 substring = PyUnicode_Substring(unicode, offset, offset+len);
7432 if (substring == NULL)
7433 return -1;
7434 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7435 if (p == NULL) {
7436 Py_DECREF(substring);
7437 return -1;
7438 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007439 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007440
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007441 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007442 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007443 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007444 NULL, 0,
7445 NULL, pusedDefaultChar);
7446 if (outsize <= 0)
7447 goto error;
7448 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007449 if (pusedDefaultChar && *pusedDefaultChar) {
7450 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007451 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007452 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007453
Victor Stinner3a50e702011-10-18 21:21:00 +02007454 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007455 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007456 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007457 if (*outbytes == NULL) {
7458 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007459 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007460 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007461 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007462 }
7463 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007464 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007465 const Py_ssize_t n = PyBytes_Size(*outbytes);
7466 if (outsize > PY_SSIZE_T_MAX - n) {
7467 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007468 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007469 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007470 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007471 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7472 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007473 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007474 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007475 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007476 }
7477
7478 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007479 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007480 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007481 out, outsize,
7482 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007483 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007484 if (outsize <= 0)
7485 goto error;
7486 if (pusedDefaultChar && *pusedDefaultChar)
7487 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007488 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007489
Victor Stinner3a50e702011-10-18 21:21:00 +02007490error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007491 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007492 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7493 return -2;
7494 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007495 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007496}
7497
Victor Stinner3a50e702011-10-18 21:21:00 +02007498/*
Serhiy Storchakad65c9492015-11-02 14:10:23 +02007499 * Encode a Unicode string to a Windows code page into a byte string using an
Victor Stinner3a50e702011-10-18 21:21:00 +02007500 * error handler.
7501 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007502 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007503 * -1 on other error.
7504 */
7505static int
7506encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007507 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007508 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007509{
Victor Stinner3a50e702011-10-18 21:21:00 +02007510 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007511 Py_ssize_t pos = unicode_offset;
7512 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007513 /* Ideally, we should get reason from FormatMessage. This is the Windows
7514 2000 English version of the message. */
7515 const char *reason = "invalid character";
7516 /* 4=maximum length of a UTF-8 sequence */
7517 char buffer[4];
7518 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7519 Py_ssize_t outsize;
7520 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007521 PyObject *errorHandler = NULL;
7522 PyObject *exc = NULL;
7523 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007524 const char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007525 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007526 PyObject *rep;
7527 int ret = -1;
7528
7529 assert(insize > 0);
7530
7531 encoding = code_page_name(code_page, &encoding_obj);
7532 if (encoding == NULL)
7533 return -1;
7534
7535 if (errors == NULL || strcmp(errors, "strict") == 0) {
7536 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7537 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007538 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007539 if (exc != NULL) {
7540 PyCodec_StrictErrors(exc);
7541 Py_DECREF(exc);
7542 }
7543 Py_XDECREF(encoding_obj);
7544 return -1;
7545 }
7546
7547 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7548 pusedDefaultChar = &usedDefaultChar;
7549 else
7550 pusedDefaultChar = NULL;
7551
7552 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7553 PyErr_NoMemory();
7554 goto error;
7555 }
7556 outsize = insize * Py_ARRAY_LENGTH(buffer);
7557
7558 if (*outbytes == NULL) {
7559 /* Create string object */
7560 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7561 if (*outbytes == NULL)
7562 goto error;
7563 out = PyBytes_AS_STRING(*outbytes);
7564 }
7565 else {
7566 /* Extend string object */
7567 Py_ssize_t n = PyBytes_Size(*outbytes);
7568 if (n > PY_SSIZE_T_MAX - outsize) {
7569 PyErr_NoMemory();
7570 goto error;
7571 }
7572 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7573 goto error;
7574 out = PyBytes_AS_STRING(*outbytes) + n;
7575 }
7576
7577 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007578 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007579 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007580 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7581 wchar_t chars[2];
7582 int charsize;
7583 if (ch < 0x10000) {
7584 chars[0] = (wchar_t)ch;
7585 charsize = 1;
7586 }
7587 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007588 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7589 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007590 charsize = 2;
7591 }
7592
Victor Stinner3a50e702011-10-18 21:21:00 +02007593 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007594 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007595 buffer, Py_ARRAY_LENGTH(buffer),
7596 NULL, pusedDefaultChar);
7597 if (outsize > 0) {
7598 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7599 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007600 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007601 memcpy(out, buffer, outsize);
7602 out += outsize;
7603 continue;
7604 }
7605 }
7606 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7607 PyErr_SetFromWindowsErr(0);
7608 goto error;
7609 }
7610
Victor Stinner3a50e702011-10-18 21:21:00 +02007611 rep = unicode_encode_call_errorhandler(
7612 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007613 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007614 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007615 if (rep == NULL)
7616 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007617 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007618
7619 if (PyBytes_Check(rep)) {
7620 outsize = PyBytes_GET_SIZE(rep);
7621 if (outsize != 1) {
7622 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7623 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7624 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7625 Py_DECREF(rep);
7626 goto error;
7627 }
7628 out = PyBytes_AS_STRING(*outbytes) + offset;
7629 }
7630 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7631 out += outsize;
7632 }
7633 else {
7634 Py_ssize_t i;
7635 enum PyUnicode_Kind kind;
7636 void *data;
7637
Benjamin Petersonbac79492012-01-14 13:34:47 -05007638 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007639 Py_DECREF(rep);
7640 goto error;
7641 }
7642
7643 outsize = PyUnicode_GET_LENGTH(rep);
7644 if (outsize != 1) {
7645 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7646 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7647 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7648 Py_DECREF(rep);
7649 goto error;
7650 }
7651 out = PyBytes_AS_STRING(*outbytes) + offset;
7652 }
7653 kind = PyUnicode_KIND(rep);
7654 data = PyUnicode_DATA(rep);
7655 for (i=0; i < outsize; i++) {
7656 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7657 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007658 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007659 encoding, unicode,
7660 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007661 "unable to encode error handler result to ASCII");
7662 Py_DECREF(rep);
7663 goto error;
7664 }
7665 *out = (unsigned char)ch;
7666 out++;
7667 }
7668 }
7669 Py_DECREF(rep);
7670 }
7671 /* write a NUL byte */
7672 *out = 0;
7673 outsize = out - PyBytes_AS_STRING(*outbytes);
7674 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7675 if (_PyBytes_Resize(outbytes, outsize) < 0)
7676 goto error;
7677 ret = 0;
7678
7679error:
7680 Py_XDECREF(encoding_obj);
7681 Py_XDECREF(errorHandler);
7682 Py_XDECREF(exc);
7683 return ret;
7684}
7685
Victor Stinner3a50e702011-10-18 21:21:00 +02007686static PyObject *
7687encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007688 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007689 const char *errors)
7690{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007691 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007692 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007693 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007694 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007695
Victor Stinner29dacf22015-01-26 16:41:32 +01007696 if (!PyUnicode_Check(unicode)) {
7697 PyErr_BadArgument();
7698 return NULL;
7699 }
7700
Benjamin Petersonbac79492012-01-14 13:34:47 -05007701 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007702 return NULL;
7703 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007704
Victor Stinner3a50e702011-10-18 21:21:00 +02007705 if (code_page < 0) {
7706 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7707 return NULL;
7708 }
7709
Martin v. Löwis3d325192011-11-04 18:23:06 +01007710 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007711 return PyBytes_FromStringAndSize(NULL, 0);
7712
Victor Stinner7581cef2011-11-03 22:32:33 +01007713 offset = 0;
7714 do
7715 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007716#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007717 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007718 chunks. */
7719 if (len > INT_MAX/2) {
7720 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007721 done = 0;
7722 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007723 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007724#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007725 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007726 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007727 done = 1;
7728 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007729
Victor Stinner76a31a62011-11-04 00:05:13 +01007730 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007731 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007732 errors);
7733 if (ret == -2)
7734 ret = encode_code_page_errors(code_page, &outbytes,
7735 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007736 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007737 if (ret < 0) {
7738 Py_XDECREF(outbytes);
7739 return NULL;
7740 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007741
Victor Stinner7581cef2011-11-03 22:32:33 +01007742 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007743 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007744 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007745
Victor Stinner3a50e702011-10-18 21:21:00 +02007746 return outbytes;
7747}
7748
7749PyObject *
7750PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7751 Py_ssize_t size,
7752 const char *errors)
7753{
Victor Stinner7581cef2011-11-03 22:32:33 +01007754 PyObject *unicode, *res;
7755 unicode = PyUnicode_FromUnicode(p, size);
7756 if (unicode == NULL)
7757 return NULL;
7758 res = encode_code_page(CP_ACP, unicode, errors);
7759 Py_DECREF(unicode);
7760 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007761}
7762
7763PyObject *
7764PyUnicode_EncodeCodePage(int code_page,
7765 PyObject *unicode,
7766 const char *errors)
7767{
Victor Stinner7581cef2011-11-03 22:32:33 +01007768 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007769}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007770
Alexander Belopolsky40018472011-02-26 01:02:56 +00007771PyObject *
7772PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007773{
Victor Stinner7581cef2011-11-03 22:32:33 +01007774 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007775}
7776
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007777#undef NEED_RETRY
7778
Victor Stinner99b95382011-07-04 14:23:54 +02007779#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007780
Guido van Rossumd57fd912000-03-10 22:53:23 +00007781/* --- Character Mapping Codec -------------------------------------------- */
7782
Victor Stinnerfb161b12013-04-18 01:44:27 +02007783static int
7784charmap_decode_string(const char *s,
7785 Py_ssize_t size,
7786 PyObject *mapping,
7787 const char *errors,
7788 _PyUnicodeWriter *writer)
7789{
7790 const char *starts = s;
7791 const char *e;
7792 Py_ssize_t startinpos, endinpos;
7793 PyObject *errorHandler = NULL, *exc = NULL;
7794 Py_ssize_t maplen;
7795 enum PyUnicode_Kind mapkind;
7796 void *mapdata;
7797 Py_UCS4 x;
7798 unsigned char ch;
7799
7800 if (PyUnicode_READY(mapping) == -1)
7801 return -1;
7802
7803 maplen = PyUnicode_GET_LENGTH(mapping);
7804 mapdata = PyUnicode_DATA(mapping);
7805 mapkind = PyUnicode_KIND(mapping);
7806
7807 e = s + size;
7808
7809 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7810 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7811 * is disabled in encoding aliases, latin1 is preferred because
7812 * its implementation is faster. */
7813 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7814 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7815 Py_UCS4 maxchar = writer->maxchar;
7816
7817 assert (writer->kind == PyUnicode_1BYTE_KIND);
7818 while (s < e) {
7819 ch = *s;
7820 x = mapdata_ucs1[ch];
7821 if (x > maxchar) {
7822 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7823 goto onError;
7824 maxchar = writer->maxchar;
7825 outdata = (Py_UCS1 *)writer->data;
7826 }
7827 outdata[writer->pos] = x;
7828 writer->pos++;
7829 ++s;
7830 }
7831 return 0;
7832 }
7833
7834 while (s < e) {
7835 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7836 enum PyUnicode_Kind outkind = writer->kind;
7837 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7838 if (outkind == PyUnicode_1BYTE_KIND) {
7839 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7840 Py_UCS4 maxchar = writer->maxchar;
7841 while (s < e) {
7842 ch = *s;
7843 x = mapdata_ucs2[ch];
7844 if (x > maxchar)
7845 goto Error;
7846 outdata[writer->pos] = x;
7847 writer->pos++;
7848 ++s;
7849 }
7850 break;
7851 }
7852 else if (outkind == PyUnicode_2BYTE_KIND) {
7853 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7854 while (s < e) {
7855 ch = *s;
7856 x = mapdata_ucs2[ch];
7857 if (x == 0xFFFE)
7858 goto Error;
7859 outdata[writer->pos] = x;
7860 writer->pos++;
7861 ++s;
7862 }
7863 break;
7864 }
7865 }
7866 ch = *s;
7867
7868 if (ch < maplen)
7869 x = PyUnicode_READ(mapkind, mapdata, ch);
7870 else
7871 x = 0xfffe; /* invalid value */
7872Error:
7873 if (x == 0xfffe)
7874 {
7875 /* undefined mapping */
7876 startinpos = s-starts;
7877 endinpos = startinpos+1;
7878 if (unicode_decode_call_errorhandler_writer(
7879 errors, &errorHandler,
7880 "charmap", "character maps to <undefined>",
7881 &starts, &e, &startinpos, &endinpos, &exc, &s,
7882 writer)) {
7883 goto onError;
7884 }
7885 continue;
7886 }
7887
7888 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
7889 goto onError;
7890 ++s;
7891 }
7892 Py_XDECREF(errorHandler);
7893 Py_XDECREF(exc);
7894 return 0;
7895
7896onError:
7897 Py_XDECREF(errorHandler);
7898 Py_XDECREF(exc);
7899 return -1;
7900}
7901
7902static int
7903charmap_decode_mapping(const char *s,
7904 Py_ssize_t size,
7905 PyObject *mapping,
7906 const char *errors,
7907 _PyUnicodeWriter *writer)
7908{
7909 const char *starts = s;
7910 const char *e;
7911 Py_ssize_t startinpos, endinpos;
7912 PyObject *errorHandler = NULL, *exc = NULL;
7913 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02007914 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02007915
7916 e = s + size;
7917
7918 while (s < e) {
7919 ch = *s;
7920
7921 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7922 key = PyLong_FromLong((long)ch);
7923 if (key == NULL)
7924 goto onError;
7925
7926 item = PyObject_GetItem(mapping, key);
7927 Py_DECREF(key);
7928 if (item == NULL) {
7929 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7930 /* No mapping found means: mapping is undefined. */
7931 PyErr_Clear();
7932 goto Undefined;
7933 } else
7934 goto onError;
7935 }
7936
7937 /* Apply mapping */
7938 if (item == Py_None)
7939 goto Undefined;
7940 if (PyLong_Check(item)) {
7941 long value = PyLong_AS_LONG(item);
7942 if (value == 0xFFFE)
7943 goto Undefined;
7944 if (value < 0 || value > MAX_UNICODE) {
7945 PyErr_Format(PyExc_TypeError,
7946 "character mapping must be in range(0x%lx)",
7947 (unsigned long)MAX_UNICODE + 1);
7948 goto onError;
7949 }
7950
7951 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7952 goto onError;
7953 }
7954 else if (PyUnicode_Check(item)) {
7955 if (PyUnicode_READY(item) == -1)
7956 goto onError;
7957 if (PyUnicode_GET_LENGTH(item) == 1) {
7958 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
7959 if (value == 0xFFFE)
7960 goto Undefined;
7961 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7962 goto onError;
7963 }
7964 else {
7965 writer->overallocate = 1;
7966 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
7967 goto onError;
7968 }
7969 }
7970 else {
7971 /* wrong return value */
7972 PyErr_SetString(PyExc_TypeError,
7973 "character mapping must return integer, None or str");
7974 goto onError;
7975 }
7976 Py_CLEAR(item);
7977 ++s;
7978 continue;
7979
7980Undefined:
7981 /* undefined mapping */
7982 Py_CLEAR(item);
7983 startinpos = s-starts;
7984 endinpos = startinpos+1;
7985 if (unicode_decode_call_errorhandler_writer(
7986 errors, &errorHandler,
7987 "charmap", "character maps to <undefined>",
7988 &starts, &e, &startinpos, &endinpos, &exc, &s,
7989 writer)) {
7990 goto onError;
7991 }
7992 }
7993 Py_XDECREF(errorHandler);
7994 Py_XDECREF(exc);
7995 return 0;
7996
7997onError:
7998 Py_XDECREF(item);
7999 Py_XDECREF(errorHandler);
8000 Py_XDECREF(exc);
8001 return -1;
8002}
8003
Alexander Belopolsky40018472011-02-26 01:02:56 +00008004PyObject *
8005PyUnicode_DecodeCharmap(const char *s,
8006 Py_ssize_t size,
8007 PyObject *mapping,
8008 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008009{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008010 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00008011
Guido van Rossumd57fd912000-03-10 22:53:23 +00008012 /* Default to Latin-1 */
8013 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008014 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008015
Guido van Rossumd57fd912000-03-10 22:53:23 +00008016 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02008017 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02008018 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02008019 writer.min_length = size;
8020 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008021 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008022
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008023 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008024 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8025 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008026 }
8027 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008028 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8029 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008030 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008031 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00008032
Benjamin Peterson29060642009-01-31 22:14:21 +00008033 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008034 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008035 return NULL;
8036}
8037
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008038/* Charmap encoding: the lookup table */
8039
Alexander Belopolsky40018472011-02-26 01:02:56 +00008040struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00008041 PyObject_HEAD
8042 unsigned char level1[32];
8043 int count2, count3;
8044 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008045};
8046
8047static PyObject*
8048encoding_map_size(PyObject *obj, PyObject* args)
8049{
8050 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008051 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00008052 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008053}
8054
8055static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008056 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00008057 PyDoc_STR("Return the size (in bytes) of this object") },
8058 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008059};
8060
8061static void
8062encoding_map_dealloc(PyObject* o)
8063{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008064 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008065}
8066
8067static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008068 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008069 "EncodingMap", /*tp_name*/
8070 sizeof(struct encoding_map), /*tp_basicsize*/
8071 0, /*tp_itemsize*/
8072 /* methods */
8073 encoding_map_dealloc, /*tp_dealloc*/
8074 0, /*tp_print*/
8075 0, /*tp_getattr*/
8076 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00008077 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008078 0, /*tp_repr*/
8079 0, /*tp_as_number*/
8080 0, /*tp_as_sequence*/
8081 0, /*tp_as_mapping*/
8082 0, /*tp_hash*/
8083 0, /*tp_call*/
8084 0, /*tp_str*/
8085 0, /*tp_getattro*/
8086 0, /*tp_setattro*/
8087 0, /*tp_as_buffer*/
8088 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8089 0, /*tp_doc*/
8090 0, /*tp_traverse*/
8091 0, /*tp_clear*/
8092 0, /*tp_richcompare*/
8093 0, /*tp_weaklistoffset*/
8094 0, /*tp_iter*/
8095 0, /*tp_iternext*/
8096 encoding_map_methods, /*tp_methods*/
8097 0, /*tp_members*/
8098 0, /*tp_getset*/
8099 0, /*tp_base*/
8100 0, /*tp_dict*/
8101 0, /*tp_descr_get*/
8102 0, /*tp_descr_set*/
8103 0, /*tp_dictoffset*/
8104 0, /*tp_init*/
8105 0, /*tp_alloc*/
8106 0, /*tp_new*/
8107 0, /*tp_free*/
8108 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008109};
8110
8111PyObject*
8112PyUnicode_BuildEncodingMap(PyObject* string)
8113{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008114 PyObject *result;
8115 struct encoding_map *mresult;
8116 int i;
8117 int need_dict = 0;
8118 unsigned char level1[32];
8119 unsigned char level2[512];
8120 unsigned char *mlevel1, *mlevel2, *mlevel3;
8121 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008122 int kind;
8123 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008124 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008125 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008126
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008127 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008128 PyErr_BadArgument();
8129 return NULL;
8130 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008131 kind = PyUnicode_KIND(string);
8132 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008133 length = PyUnicode_GET_LENGTH(string);
8134 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008135 memset(level1, 0xFF, sizeof level1);
8136 memset(level2, 0xFF, sizeof level2);
8137
8138 /* If there isn't a one-to-one mapping of NULL to \0,
8139 or if there are non-BMP characters, we need to use
8140 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008141 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008142 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008143 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008144 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008145 ch = PyUnicode_READ(kind, data, i);
8146 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008147 need_dict = 1;
8148 break;
8149 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008150 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008151 /* unmapped character */
8152 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008153 l1 = ch >> 11;
8154 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008155 if (level1[l1] == 0xFF)
8156 level1[l1] = count2++;
8157 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008158 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008159 }
8160
8161 if (count2 >= 0xFF || count3 >= 0xFF)
8162 need_dict = 1;
8163
8164 if (need_dict) {
8165 PyObject *result = PyDict_New();
8166 PyObject *key, *value;
8167 if (!result)
8168 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008169 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008170 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008171 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008172 if (!key || !value)
8173 goto failed1;
8174 if (PyDict_SetItem(result, key, value) == -1)
8175 goto failed1;
8176 Py_DECREF(key);
8177 Py_DECREF(value);
8178 }
8179 return result;
8180 failed1:
8181 Py_XDECREF(key);
8182 Py_XDECREF(value);
8183 Py_DECREF(result);
8184 return NULL;
8185 }
8186
8187 /* Create a three-level trie */
8188 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8189 16*count2 + 128*count3 - 1);
8190 if (!result)
8191 return PyErr_NoMemory();
8192 PyObject_Init(result, &EncodingMapType);
8193 mresult = (struct encoding_map*)result;
8194 mresult->count2 = count2;
8195 mresult->count3 = count3;
8196 mlevel1 = mresult->level1;
8197 mlevel2 = mresult->level23;
8198 mlevel3 = mresult->level23 + 16*count2;
8199 memcpy(mlevel1, level1, 32);
8200 memset(mlevel2, 0xFF, 16*count2);
8201 memset(mlevel3, 0, 128*count3);
8202 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008203 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008204 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008205 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8206 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008207 /* unmapped character */
8208 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008209 o1 = ch>>11;
8210 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008211 i2 = 16*mlevel1[o1] + o2;
8212 if (mlevel2[i2] == 0xFF)
8213 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008214 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008215 i3 = 128*mlevel2[i2] + o3;
8216 mlevel3[i3] = i;
8217 }
8218 return result;
8219}
8220
8221static int
Victor Stinner22168992011-11-20 17:09:18 +01008222encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008223{
8224 struct encoding_map *map = (struct encoding_map*)mapping;
8225 int l1 = c>>11;
8226 int l2 = (c>>7) & 0xF;
8227 int l3 = c & 0x7F;
8228 int i;
8229
Victor Stinner22168992011-11-20 17:09:18 +01008230 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008231 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008232 if (c == 0)
8233 return 0;
8234 /* level 1*/
8235 i = map->level1[l1];
8236 if (i == 0xFF) {
8237 return -1;
8238 }
8239 /* level 2*/
8240 i = map->level23[16*i+l2];
8241 if (i == 0xFF) {
8242 return -1;
8243 }
8244 /* level 3 */
8245 i = map->level23[16*map->count2 + 128*i + l3];
8246 if (i == 0) {
8247 return -1;
8248 }
8249 return i;
8250}
8251
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008252/* Lookup the character ch in the mapping. If the character
8253 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008254 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008255static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008256charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008257{
Christian Heimes217cfd12007-12-02 14:31:20 +00008258 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008259 PyObject *x;
8260
8261 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008262 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008263 x = PyObject_GetItem(mapping, w);
8264 Py_DECREF(w);
8265 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008266 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8267 /* No mapping found means: mapping is undefined. */
8268 PyErr_Clear();
8269 x = Py_None;
8270 Py_INCREF(x);
8271 return x;
8272 } else
8273 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008274 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008275 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008276 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008277 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008278 long value = PyLong_AS_LONG(x);
8279 if (value < 0 || value > 255) {
8280 PyErr_SetString(PyExc_TypeError,
8281 "character mapping must be in range(256)");
8282 Py_DECREF(x);
8283 return NULL;
8284 }
8285 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008286 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008287 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008288 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008289 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008290 /* wrong return value */
8291 PyErr_Format(PyExc_TypeError,
8292 "character mapping must return integer, bytes or None, not %.400s",
8293 x->ob_type->tp_name);
8294 Py_DECREF(x);
8295 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008296 }
8297}
8298
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008299static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008300charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008301{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008302 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8303 /* exponentially overallocate to minimize reallocations */
8304 if (requiredsize < 2*outsize)
8305 requiredsize = 2*outsize;
8306 if (_PyBytes_Resize(outobj, requiredsize))
8307 return -1;
8308 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008309}
8310
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the character has no mapping; nothing was written */
    enc_EXCEPTION   /* an error occurred (lookup or resize failed) */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008314/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008315 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008316 space is available. Return a new reference to the object that
8317 was put in the output buffer, or Py_None, if the mapping was undefined
8318 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008319 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008320static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008321charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008322 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008323{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008324 PyObject *rep;
8325 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008326 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008327
Christian Heimes90aa7642007-12-19 02:45:37 +00008328 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008329 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008330 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008331 if (res == -1)
8332 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008333 if (outsize<requiredsize)
8334 if (charmapencode_resize(outobj, outpos, requiredsize))
8335 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008336 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008337 outstart[(*outpos)++] = (char)res;
8338 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008339 }
8340
8341 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008342 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008343 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008344 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008345 Py_DECREF(rep);
8346 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008347 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008348 if (PyLong_Check(rep)) {
8349 Py_ssize_t requiredsize = *outpos+1;
8350 if (outsize<requiredsize)
8351 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8352 Py_DECREF(rep);
8353 return enc_EXCEPTION;
8354 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008355 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008356 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008357 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008358 else {
8359 const char *repchars = PyBytes_AS_STRING(rep);
8360 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8361 Py_ssize_t requiredsize = *outpos+repsize;
8362 if (outsize<requiredsize)
8363 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8364 Py_DECREF(rep);
8365 return enc_EXCEPTION;
8366 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008367 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008368 memcpy(outstart + *outpos, repchars, repsize);
8369 *outpos += repsize;
8370 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008371 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008372 Py_DECREF(rep);
8373 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008374}
8375
8376/* handle an error in PyUnicode_EncodeCharmap
8377 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008378static int
8379charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008380 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008381 PyObject **exceptionObject,
Victor Stinner50149202015-09-22 00:26:54 +02008382 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008383 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008384{
8385 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008386 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008387 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008388 enum PyUnicode_Kind kind;
8389 void *data;
8390 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008391 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008392 Py_ssize_t collstartpos = *inpos;
8393 Py_ssize_t collendpos = *inpos+1;
8394 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008395 char *encoding = "charmap";
8396 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008397 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008398 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008399 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008400
Benjamin Petersonbac79492012-01-14 13:34:47 -05008401 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008402 return -1;
8403 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008404 /* find all unencodable characters */
8405 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008406 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008407 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008408 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008409 val = encoding_map_lookup(ch, mapping);
8410 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008411 break;
8412 ++collendpos;
8413 continue;
8414 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008415
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008416 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8417 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008418 if (rep==NULL)
8419 return -1;
8420 else if (rep!=Py_None) {
8421 Py_DECREF(rep);
8422 break;
8423 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008424 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008425 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008426 }
8427 /* cache callback name lookup
8428 * (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02008429 if (*error_handler == _Py_ERROR_UNKNOWN)
8430 *error_handler = get_error_handler(errors);
8431
8432 switch (*error_handler) {
8433 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008434 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008435 return -1;
Victor Stinner50149202015-09-22 00:26:54 +02008436
8437 case _Py_ERROR_REPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008438 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008439 x = charmapencode_output('?', mapping, res, respos);
8440 if (x==enc_EXCEPTION) {
8441 return -1;
8442 }
8443 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008444 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008445 return -1;
8446 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008447 }
8448 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02008449 case _Py_ERROR_IGNORE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008450 *inpos = collendpos;
8451 break;
Victor Stinner50149202015-09-22 00:26:54 +02008452
8453 case _Py_ERROR_XMLCHARREFREPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008454 /* generate replacement (temporarily (mis)uses p) */
8455 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008456 char buffer[2+29+1+1];
8457 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008458 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008459 for (cp = buffer; *cp; ++cp) {
8460 x = charmapencode_output(*cp, mapping, res, respos);
8461 if (x==enc_EXCEPTION)
8462 return -1;
8463 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008464 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008465 return -1;
8466 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008467 }
8468 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008469 *inpos = collendpos;
8470 break;
Victor Stinner50149202015-09-22 00:26:54 +02008471
Benjamin Peterson14339b62009-01-31 16:36:08 +00008472 default:
Victor Stinner50149202015-09-22 00:26:54 +02008473 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008474 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008475 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008476 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008477 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008478 if (PyBytes_Check(repunicode)) {
8479 /* Directly copy bytes result to output. */
8480 Py_ssize_t outsize = PyBytes_Size(*res);
8481 Py_ssize_t requiredsize;
8482 repsize = PyBytes_Size(repunicode);
8483 requiredsize = *respos + repsize;
8484 if (requiredsize > outsize)
8485 /* Make room for all additional bytes. */
8486 if (charmapencode_resize(res, respos, requiredsize)) {
8487 Py_DECREF(repunicode);
8488 return -1;
8489 }
8490 memcpy(PyBytes_AsString(*res) + *respos,
8491 PyBytes_AsString(repunicode), repsize);
8492 *respos += repsize;
8493 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008494 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008495 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008496 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008497 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008498 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008499 Py_DECREF(repunicode);
8500 return -1;
8501 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008502 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008503 data = PyUnicode_DATA(repunicode);
8504 kind = PyUnicode_KIND(repunicode);
8505 for (index = 0; index < repsize; index++) {
8506 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8507 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008508 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008509 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008510 return -1;
8511 }
8512 else if (x==enc_FAILED) {
8513 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008514 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008515 return -1;
8516 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008517 }
8518 *inpos = newpos;
8519 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008520 }
8521 return 0;
8522}
8523
Alexander Belopolsky40018472011-02-26 01:02:56 +00008524PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008525_PyUnicode_EncodeCharmap(PyObject *unicode,
8526 PyObject *mapping,
8527 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008528{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008529 /* output object */
8530 PyObject *res = NULL;
8531 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008532 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008533 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008534 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008535 Py_ssize_t respos = 0;
Victor Stinner50149202015-09-22 00:26:54 +02008536 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008537 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02008538 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008539 void *data;
8540 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008541
Benjamin Petersonbac79492012-01-14 13:34:47 -05008542 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008543 return NULL;
8544 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008545 data = PyUnicode_DATA(unicode);
8546 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008547
Guido van Rossumd57fd912000-03-10 22:53:23 +00008548 /* Default to Latin-1 */
8549 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008550 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008551
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008552 /* allocate enough for a simple encoding without
8553 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008554 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008555 if (res == NULL)
8556 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008557 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008558 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008559
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008560 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008561 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008562 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008563 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008564 if (x==enc_EXCEPTION) /* error */
8565 goto onError;
8566 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008567 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008568 &exc,
Victor Stinner50149202015-09-22 00:26:54 +02008569 &error_handler, &error_handler_obj, errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00008570 &res, &respos)) {
8571 goto onError;
8572 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008573 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008574 else
8575 /* done with this character => adjust input position */
8576 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008577 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008578
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008579 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008580 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008581 if (_PyBytes_Resize(&res, respos) < 0)
8582 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008583
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008584 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008585 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008586 return res;
8587
Benjamin Peterson29060642009-01-31 22:14:21 +00008588 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008589 Py_XDECREF(res);
8590 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008591 Py_XDECREF(error_handler_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008592 return NULL;
8593}
8594
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008595/* Deprecated */
8596PyObject *
8597PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8598 Py_ssize_t size,
8599 PyObject *mapping,
8600 const char *errors)
8601{
8602 PyObject *result;
8603 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8604 if (unicode == NULL)
8605 return NULL;
8606 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8607 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008608 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008609}
8610
Alexander Belopolsky40018472011-02-26 01:02:56 +00008611PyObject *
8612PyUnicode_AsCharmapString(PyObject *unicode,
8613 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008614{
8615 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008616 PyErr_BadArgument();
8617 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008618 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008619 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008620}
8621
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008622/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008623static void
8624make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008625 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008626 Py_ssize_t startpos, Py_ssize_t endpos,
8627 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008628{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008629 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008630 *exceptionObject = _PyUnicodeTranslateError_Create(
8631 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008632 }
8633 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008634 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8635 goto onError;
8636 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8637 goto onError;
8638 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8639 goto onError;
8640 return;
8641 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008642 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008643 }
8644}
8645
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008646/* error handling callback helper:
8647 build arguments, call the callback and check the arguments,
8648 put the result into newpos and return the replacement string, which
8649 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008650static PyObject *
8651unicode_translate_call_errorhandler(const char *errors,
8652 PyObject **errorHandler,
8653 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008654 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008655 Py_ssize_t startpos, Py_ssize_t endpos,
8656 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008657{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02008658 static const char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008659
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008660 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008661 PyObject *restuple;
8662 PyObject *resunicode;
8663
8664 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008665 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008666 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008667 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008668 }
8669
8670 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008671 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008672 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008673 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008674
8675 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008676 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008677 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008678 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008679 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008680 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008681 Py_DECREF(restuple);
8682 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008683 }
8684 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008685 &resunicode, &i_newpos)) {
8686 Py_DECREF(restuple);
8687 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008688 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008689 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008690 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008691 else
8692 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008693 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02008694 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008695 Py_DECREF(restuple);
8696 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008697 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008698 Py_INCREF(resunicode);
8699 Py_DECREF(restuple);
8700 return resunicode;
8701}
8702
8703/* Lookup the character ch in the mapping and put the result in result,
8704 which must be decrefed by the caller.
8705 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008706static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008707charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008708{
Christian Heimes217cfd12007-12-02 14:31:20 +00008709 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008710 PyObject *x;
8711
8712 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008713 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008714 x = PyObject_GetItem(mapping, w);
8715 Py_DECREF(w);
8716 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008717 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8718 /* No mapping found means: use 1:1 mapping. */
8719 PyErr_Clear();
8720 *result = NULL;
8721 return 0;
8722 } else
8723 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008724 }
8725 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008726 *result = x;
8727 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008728 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008729 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008730 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008731 if (value < 0 || value > MAX_UNICODE) {
8732 PyErr_Format(PyExc_ValueError,
8733 "character mapping must be in range(0x%x)",
8734 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008735 Py_DECREF(x);
8736 return -1;
8737 }
8738 *result = x;
8739 return 0;
8740 }
8741 else if (PyUnicode_Check(x)) {
8742 *result = x;
8743 return 0;
8744 }
8745 else {
8746 /* wrong return value */
8747 PyErr_SetString(PyExc_TypeError,
8748 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008749 Py_DECREF(x);
8750 return -1;
8751 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008752}
Victor Stinner1194ea02014-04-04 19:37:40 +02008753
8754/* lookup the character, write the result into the writer.
8755 Return 1 if the result was written into the writer, return 0 if the mapping
8756 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008757static int
Victor Stinner1194ea02014-04-04 19:37:40 +02008758charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8759 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008760{
Victor Stinner1194ea02014-04-04 19:37:40 +02008761 PyObject *item;
8762
8763 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00008764 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008765
8766 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008767 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02008768 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008769 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008770 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008771 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008772 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008773
8774 if (item == Py_None) {
8775 Py_DECREF(item);
8776 return 0;
8777 }
8778
8779 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02008780 long ch = (Py_UCS4)PyLong_AS_LONG(item);
8781 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8782 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008783 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8784 Py_DECREF(item);
8785 return -1;
8786 }
8787 Py_DECREF(item);
8788 return 1;
8789 }
8790
8791 if (!PyUnicode_Check(item)) {
8792 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008793 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008794 }
8795
8796 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8797 Py_DECREF(item);
8798 return -1;
8799 }
8800
8801 Py_DECREF(item);
8802 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008803}
8804
Victor Stinner89a76ab2014-04-05 11:44:04 +02008805static int
8806unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8807 Py_UCS1 *translate)
8808{
Benjamin Peterson1365de72014-04-07 20:15:41 -04008809 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008810 int ret = 0;
8811
Victor Stinner89a76ab2014-04-05 11:44:04 +02008812 if (charmaptranslate_lookup(ch, mapping, &item)) {
8813 return -1;
8814 }
8815
8816 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008817 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02008818 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008819 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008820 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008821 /* not found => default to 1:1 mapping */
8822 translate[ch] = ch;
8823 return 1;
8824 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008825 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02008826 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008827 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8828 used it */
8829 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008830 /* invalid character or character outside ASCII:
8831 skip the fast translate */
8832 goto exit;
8833 }
8834 translate[ch] = (Py_UCS1)replace;
8835 }
8836 else if (PyUnicode_Check(item)) {
8837 Py_UCS4 replace;
8838
8839 if (PyUnicode_READY(item) == -1) {
8840 Py_DECREF(item);
8841 return -1;
8842 }
8843 if (PyUnicode_GET_LENGTH(item) != 1)
8844 goto exit;
8845
8846 replace = PyUnicode_READ_CHAR(item, 0);
8847 if (replace > 127)
8848 goto exit;
8849 translate[ch] = (Py_UCS1)replace;
8850 }
8851 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008852 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02008853 goto exit;
8854 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008855 ret = 1;
8856
Benjamin Peterson1365de72014-04-07 20:15:41 -04008857 exit:
8858 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008859 return ret;
8860}
8861
8862/* Fast path for ascii => ascii translation. Return 1 if the whole string
8863 was translated into writer, return 0 if the input string was partially
8864 translated into writer, raise an exception and return -1 on error. */
8865static int
8866unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008867 _PyUnicodeWriter *writer, int ignore,
8868 Py_ssize_t *input_pos)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008869{
Victor Stinner872b2912014-04-05 14:27:07 +02008870 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008871 Py_ssize_t len;
8872 Py_UCS1 *in, *end, *out;
Victor Stinner872b2912014-04-05 14:27:07 +02008873 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008874
Victor Stinner89a76ab2014-04-05 11:44:04 +02008875 len = PyUnicode_GET_LENGTH(input);
8876
Victor Stinner872b2912014-04-05 14:27:07 +02008877 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008878
8879 in = PyUnicode_1BYTE_DATA(input);
8880 end = in + len;
8881
8882 assert(PyUnicode_IS_ASCII(writer->buffer));
8883 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
8884 out = PyUnicode_1BYTE_DATA(writer->buffer);
8885
Victor Stinner872b2912014-04-05 14:27:07 +02008886 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008887 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02008888 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008889 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02008890 int translate = unicode_fast_translate_lookup(mapping, ch,
8891 ascii_table);
8892 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008893 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02008894 if (translate == 0)
8895 goto exit;
8896 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008897 }
Victor Stinner872b2912014-04-05 14:27:07 +02008898 if (ch2 == 0xfe) {
8899 if (ignore)
8900 continue;
8901 goto exit;
8902 }
8903 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008904 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02008905 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008906 }
Victor Stinner872b2912014-04-05 14:27:07 +02008907 res = 1;
8908
8909exit:
8910 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008911 *input_pos = in - PyUnicode_1BYTE_DATA(input);
Victor Stinner872b2912014-04-05 14:27:07 +02008912 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008913}
8914
Victor Stinner3222da22015-10-01 22:07:32 +02008915static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008916_PyUnicode_TranslateCharmap(PyObject *input,
8917 PyObject *mapping,
8918 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008919{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008920 /* input object */
Victor Stinner1194ea02014-04-04 19:37:40 +02008921 char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008922 Py_ssize_t size, i;
8923 int kind;
8924 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02008925 _PyUnicodeWriter writer;
8926 /* error handler */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008927 char *reason = "character maps to <undefined>";
8928 PyObject *errorHandler = NULL;
8929 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008930 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008931 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008932
Guido van Rossumd57fd912000-03-10 22:53:23 +00008933 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008934 PyErr_BadArgument();
8935 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008936 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008937
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008938 if (PyUnicode_READY(input) == -1)
8939 return NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008940 data = (char*)PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008941 kind = PyUnicode_KIND(input);
8942 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008943
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03008944 if (size == 0)
8945 return PyUnicode_FromObject(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008946
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008947 /* allocate enough for a simple 1:1 translation without
8948 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02008949 _PyUnicodeWriter_Init(&writer);
8950 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008951 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008952
Victor Stinner872b2912014-04-05 14:27:07 +02008953 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
8954
Victor Stinner33798672016-03-01 21:59:58 +01008955 if (PyUnicode_READY(input) == -1)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008956 return NULL;
Victor Stinner33798672016-03-01 21:59:58 +01008957 if (PyUnicode_IS_ASCII(input)) {
8958 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
8959 if (res < 0) {
8960 _PyUnicodeWriter_Dealloc(&writer);
8961 return NULL;
8962 }
8963 if (res == 1)
8964 return _PyUnicodeWriter_Finish(&writer);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008965 }
Victor Stinner33798672016-03-01 21:59:58 +01008966 else {
8967 i = 0;
8968 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008969
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008970 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008971 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008972 int translate;
8973 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8974 Py_ssize_t newpos;
8975 /* startpos for collecting untranslatable chars */
8976 Py_ssize_t collstart;
8977 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02008978 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008979
Victor Stinner1194ea02014-04-04 19:37:40 +02008980 ch = PyUnicode_READ(kind, data, i);
8981 translate = charmaptranslate_output(ch, mapping, &writer);
8982 if (translate < 0)
8983 goto onError;
8984
8985 if (translate != 0) {
8986 /* it worked => adjust input pointer */
8987 ++i;
8988 continue;
8989 }
8990
8991 /* untranslatable character */
8992 collstart = i;
8993 collend = i+1;
8994
8995 /* find all untranslatable characters */
8996 while (collend < size) {
8997 PyObject *x;
8998 ch = PyUnicode_READ(kind, data, collend);
8999 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009000 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02009001 Py_XDECREF(x);
9002 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00009003 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02009004 ++collend;
9005 }
9006
9007 if (ignore) {
9008 i = collend;
9009 }
9010 else {
9011 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9012 reason, input, &exc,
9013 collstart, collend, &newpos);
9014 if (repunicode == NULL)
9015 goto onError;
9016 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009017 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02009018 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009019 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009020 Py_DECREF(repunicode);
9021 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009022 }
9023 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009024 Py_XDECREF(exc);
9025 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02009026 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009027
Benjamin Peterson29060642009-01-31 22:14:21 +00009028 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02009029 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009030 Py_XDECREF(exc);
9031 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009032 return NULL;
9033}
9034
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009035/* Deprecated. Use PyUnicode_Translate instead. */
9036PyObject *
9037PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9038 Py_ssize_t size,
9039 PyObject *mapping,
9040 const char *errors)
9041{
Christian Heimes5f520f42012-09-11 14:03:25 +02009042 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009043 PyObject *unicode = PyUnicode_FromUnicode(p, size);
9044 if (!unicode)
9045 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02009046 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9047 Py_DECREF(unicode);
9048 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009049}
9050
Alexander Belopolsky40018472011-02-26 01:02:56 +00009051PyObject *
9052PyUnicode_Translate(PyObject *str,
9053 PyObject *mapping,
9054 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009055{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009056 if (ensure_unicode(str) < 0)
Christian Heimes5f520f42012-09-11 14:03:25 +02009057 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009058 return _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009059}
Tim Petersced69f82003-09-16 20:30:58 +00009060
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009061static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009062fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009063{
9064 /* No need to call PyUnicode_READY(self) because this function is only
9065 called as a callback from fixup() which does it already. */
9066 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9067 const int kind = PyUnicode_KIND(self);
9068 void *data = PyUnicode_DATA(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02009069 Py_UCS4 maxchar = 127, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009070 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009071 Py_ssize_t i;
9072
9073 for (i = 0; i < len; ++i) {
9074 ch = PyUnicode_READ(kind, data, i);
9075 fixed = 0;
9076 if (ch > 127) {
9077 if (Py_UNICODE_ISSPACE(ch))
9078 fixed = ' ';
9079 else {
9080 const int decimal = Py_UNICODE_TODECIMAL(ch);
9081 if (decimal >= 0)
9082 fixed = '0' + decimal;
9083 }
9084 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009085 modified = 1;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009086 maxchar = Py_MAX(maxchar, fixed);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009087 PyUnicode_WRITE(kind, data, i, fixed);
9088 }
Victor Stinnere6abb482012-05-02 01:15:40 +02009089 else
Benjamin Peterson7e303732013-06-10 09:19:46 -07009090 maxchar = Py_MAX(maxchar, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009091 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009092 }
9093
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009094 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009095}
9096
9097PyObject *
9098_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9099{
9100 if (!PyUnicode_Check(unicode)) {
9101 PyErr_BadInternalCall();
9102 return NULL;
9103 }
9104 if (PyUnicode_READY(unicode) == -1)
9105 return NULL;
9106 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
9107 /* If the string is already ASCII, just return the same string */
9108 Py_INCREF(unicode);
9109 return unicode;
9110 }
Victor Stinner9310abb2011-10-05 00:59:23 +02009111 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009112}
9113
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009114PyObject *
9115PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9116 Py_ssize_t length)
9117{
Victor Stinnerf0124502011-11-21 23:12:56 +01009118 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009119 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009120 Py_UCS4 maxchar;
9121 enum PyUnicode_Kind kind;
9122 void *data;
9123
Victor Stinner99d7ad02012-02-22 13:37:39 +01009124 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009125 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009126 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009127 if (ch > 127) {
9128 int decimal = Py_UNICODE_TODECIMAL(ch);
9129 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009130 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009131 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009132 }
9133 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009134
9135 /* Copy to a new string */
9136 decimal = PyUnicode_New(length, maxchar);
9137 if (decimal == NULL)
9138 return decimal;
9139 kind = PyUnicode_KIND(decimal);
9140 data = PyUnicode_DATA(decimal);
9141 /* Iterate over code points */
9142 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009143 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01009144 if (ch > 127) {
9145 int decimal = Py_UNICODE_TODECIMAL(ch);
9146 if (decimal >= 0)
9147 ch = '0' + decimal;
9148 }
9149 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009150 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009151 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009152}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009153/* --- Decimal Encoder ---------------------------------------------------- */
9154
Alexander Belopolsky40018472011-02-26 01:02:56 +00009155int
9156PyUnicode_EncodeDecimal(Py_UNICODE *s,
9157 Py_ssize_t length,
9158 char *output,
9159 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009160{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009161 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009162 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009163 enum PyUnicode_Kind kind;
9164 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009165
9166 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009167 PyErr_BadArgument();
9168 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009169 }
9170
Victor Stinner42bf7752011-11-21 22:52:58 +01009171 unicode = PyUnicode_FromUnicode(s, length);
9172 if (unicode == NULL)
9173 return -1;
9174
Benjamin Petersonbac79492012-01-14 13:34:47 -05009175 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner6345be92011-11-25 20:09:01 +01009176 Py_DECREF(unicode);
9177 return -1;
9178 }
Victor Stinner42bf7752011-11-21 22:52:58 +01009179 kind = PyUnicode_KIND(unicode);
9180 data = PyUnicode_DATA(unicode);
9181
Victor Stinnerb84d7232011-11-22 01:50:07 +01009182 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009183 PyObject *exc;
9184 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009185 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009186 Py_ssize_t startpos;
9187
9188 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009189
Benjamin Peterson29060642009-01-31 22:14:21 +00009190 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009191 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009192 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009193 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009194 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009195 decimal = Py_UNICODE_TODECIMAL(ch);
9196 if (decimal >= 0) {
9197 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009198 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009199 continue;
9200 }
9201 if (0 < ch && ch < 256) {
9202 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009203 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009204 continue;
9205 }
Victor Stinner6345be92011-11-25 20:09:01 +01009206
Victor Stinner42bf7752011-11-21 22:52:58 +01009207 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009208 exc = NULL;
9209 raise_encode_exception(&exc, "decimal", unicode,
9210 startpos, startpos+1,
9211 "invalid decimal Unicode string");
9212 Py_XDECREF(exc);
9213 Py_DECREF(unicode);
9214 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009215 }
9216 /* 0-terminate the output string */
9217 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009218 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009219 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009220}
9221
Guido van Rossumd57fd912000-03-10 22:53:23 +00009222/* --- Helpers ------------------------------------------------------------ */
9223
/* Helper macro to fix up start/end slice values against a length `len`.

   `end` is clamped to `len` when larger; a negative `end` has `len` added
   and is then clamped to 0.  A negative `start` has `len` added and is then
   clamped to 0; a `start` above `len` is left untouched.

   `start` and `end` must be modifiable lvalues.  Every argument is
   evaluated more than once, so arguments must be free of side effects.
   The body is wrapped in do { } while (0) so that the expansion is a single
   statement and can be used safely in an unbraced if/else; invoke it with a
   trailing semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len)) {                    \
            (end) = (len);                      \
        }                                       \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0) {                    \
                (end) = 0;                      \
            }                                   \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0) {                  \
                (start) = 0;                    \
            }                                   \
        }                                       \
    } while (0)
9238
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009239static Py_ssize_t
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009240any_find_slice(PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009241 Py_ssize_t start,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009242 Py_ssize_t end,
9243 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009244{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009245 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009246 void *buf1, *buf2;
9247 Py_ssize_t len1, len2, result;
9248
9249 kind1 = PyUnicode_KIND(s1);
9250 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009251 if (kind1 < kind2)
9252 return -1;
9253
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009254 len1 = PyUnicode_GET_LENGTH(s1);
9255 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009256 ADJUST_INDICES(start, end, len1);
9257 if (end - start < len2)
9258 return -1;
9259
9260 buf1 = PyUnicode_DATA(s1);
9261 buf2 = PyUnicode_DATA(s2);
9262 if (len2 == 1) {
9263 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9264 result = findchar((const char *)buf1 + kind1*start,
9265 kind1, end - start, ch, direction);
9266 if (result == -1)
9267 return -1;
9268 else
9269 return start + result;
9270 }
9271
9272 if (kind2 != kind1) {
9273 buf2 = _PyUnicode_AsKind(s2, kind1);
9274 if (!buf2)
9275 return -2;
9276 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009277
Victor Stinner794d5672011-10-10 03:21:36 +02009278 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009279 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009280 case PyUnicode_1BYTE_KIND:
9281 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9282 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9283 else
9284 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9285 break;
9286 case PyUnicode_2BYTE_KIND:
9287 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9288 break;
9289 case PyUnicode_4BYTE_KIND:
9290 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9291 break;
9292 default:
9293 assert(0); result = -2;
9294 }
9295 }
9296 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009297 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009298 case PyUnicode_1BYTE_KIND:
9299 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9300 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9301 else
9302 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9303 break;
9304 case PyUnicode_2BYTE_KIND:
9305 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9306 break;
9307 case PyUnicode_4BYTE_KIND:
9308 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9309 break;
9310 default:
9311 assert(0); result = -2;
9312 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009313 }
9314
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009315 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009316 PyMem_Free(buf2);
9317
9318 return result;
9319}
9320
9321Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009322_PyUnicode_InsertThousandsGrouping(
9323 PyObject *unicode, Py_ssize_t index,
9324 Py_ssize_t n_buffer,
9325 void *digits, Py_ssize_t n_digits,
9326 Py_ssize_t min_width,
9327 const char *grouping, PyObject *thousands_sep,
9328 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009329{
Victor Stinner41a863c2012-02-24 00:37:51 +01009330 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009331 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01009332 Py_ssize_t thousands_sep_len;
9333 Py_ssize_t len;
9334
9335 if (unicode != NULL) {
9336 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009337 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01009338 }
9339 else {
9340 kind = PyUnicode_1BYTE_KIND;
9341 data = NULL;
9342 }
9343 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
9344 thousands_sep_data = PyUnicode_DATA(thousands_sep);
9345 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9346 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01009347 if (thousands_sep_kind < kind) {
9348 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
9349 if (!thousands_sep_data)
9350 return -1;
9351 }
9352 else {
9353 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
9354 if (!data)
9355 return -1;
9356 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009357 }
9358
Benjamin Petersonead6b532011-12-20 17:23:42 -06009359 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009360 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009361 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01009362 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009363 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009364 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009365 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009366 else
Victor Stinner41a863c2012-02-24 00:37:51 +01009367 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02009368 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009369 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009370 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009371 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009372 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009373 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009374 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009375 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009376 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009377 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009378 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009379 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009380 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009381 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009382 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009383 break;
9384 default:
9385 assert(0);
9386 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009387 }
Victor Stinner90f50d42012-02-24 01:44:47 +01009388 if (unicode != NULL && thousands_sep_kind != kind) {
9389 if (thousands_sep_kind < kind)
9390 PyMem_Free(thousands_sep_data);
9391 else
9392 PyMem_Free(data);
9393 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009394 if (unicode == NULL) {
9395 *maxchar = 127;
9396 if (len != n_digits) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009397 *maxchar = Py_MAX(*maxchar,
Victor Stinnere6abb482012-05-02 01:15:40 +02009398 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
Victor Stinner41a863c2012-02-24 00:37:51 +01009399 }
9400 }
9401 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009402}
9403
9404
Alexander Belopolsky40018472011-02-26 01:02:56 +00009405Py_ssize_t
9406PyUnicode_Count(PyObject *str,
9407 PyObject *substr,
9408 Py_ssize_t start,
9409 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009410{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009411 Py_ssize_t result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009412 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009413 void *buf1 = NULL, *buf2 = NULL;
9414 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009415
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009416 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009417 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009418
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009419 kind1 = PyUnicode_KIND(str);
9420 kind2 = PyUnicode_KIND(substr);
9421 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009422 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009423
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009424 len1 = PyUnicode_GET_LENGTH(str);
9425 len2 = PyUnicode_GET_LENGTH(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009426 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009427 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009428 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009429
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009430 buf1 = PyUnicode_DATA(str);
9431 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009432 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009433 buf2 = _PyUnicode_AsKind(substr, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009434 if (!buf2)
9435 goto onError;
9436 }
9437
9438 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009439 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009440 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
Victor Stinnerc3cec782011-10-05 21:24:08 +02009441 result = asciilib_count(
9442 ((Py_UCS1*)buf1) + start, end - start,
9443 buf2, len2, PY_SSIZE_T_MAX
9444 );
9445 else
9446 result = ucs1lib_count(
9447 ((Py_UCS1*)buf1) + start, end - start,
9448 buf2, len2, PY_SSIZE_T_MAX
9449 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009450 break;
9451 case PyUnicode_2BYTE_KIND:
9452 result = ucs2lib_count(
9453 ((Py_UCS2*)buf1) + start, end - start,
9454 buf2, len2, PY_SSIZE_T_MAX
9455 );
9456 break;
9457 case PyUnicode_4BYTE_KIND:
9458 result = ucs4lib_count(
9459 ((Py_UCS4*)buf1) + start, end - start,
9460 buf2, len2, PY_SSIZE_T_MAX
9461 );
9462 break;
9463 default:
9464 assert(0); result = 0;
9465 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009466
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009467 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009468 PyMem_Free(buf2);
9469
Guido van Rossumd57fd912000-03-10 22:53:23 +00009470 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009471 onError:
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009472 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009473 PyMem_Free(buf2);
9474 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009475}
9476
Alexander Belopolsky40018472011-02-26 01:02:56 +00009477Py_ssize_t
9478PyUnicode_Find(PyObject *str,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009479 PyObject *substr,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009480 Py_ssize_t start,
9481 Py_ssize_t end,
9482 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009483{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009484 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009485 return -2;
Tim Petersced69f82003-09-16 20:30:58 +00009486
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009487 return any_find_slice(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009488}
9489
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009490Py_ssize_t
9491PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9492 Py_ssize_t start, Py_ssize_t end,
9493 int direction)
9494{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009495 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009496 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009497 if (PyUnicode_READY(str) == -1)
9498 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009499 if (start < 0 || end < 0) {
9500 PyErr_SetString(PyExc_IndexError, "string index out of range");
9501 return -2;
9502 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009503 if (end > PyUnicode_GET_LENGTH(str))
9504 end = PyUnicode_GET_LENGTH(str);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009505 if (start >= end)
9506 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009507 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009508 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9509 kind, end-start, ch, direction);
9510 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009511 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009512 else
9513 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009514}
9515
Alexander Belopolsky40018472011-02-26 01:02:56 +00009516static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009517tailmatch(PyObject *self,
9518 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009519 Py_ssize_t start,
9520 Py_ssize_t end,
9521 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009522{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009523 int kind_self;
9524 int kind_sub;
9525 void *data_self;
9526 void *data_sub;
9527 Py_ssize_t offset;
9528 Py_ssize_t i;
9529 Py_ssize_t end_sub;
9530
9531 if (PyUnicode_READY(self) == -1 ||
9532 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009533 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009534
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009535 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9536 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009537 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009538 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009539
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009540 if (PyUnicode_GET_LENGTH(substring) == 0)
9541 return 1;
9542
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009543 kind_self = PyUnicode_KIND(self);
9544 data_self = PyUnicode_DATA(self);
9545 kind_sub = PyUnicode_KIND(substring);
9546 data_sub = PyUnicode_DATA(substring);
9547 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9548
9549 if (direction > 0)
9550 offset = end;
9551 else
9552 offset = start;
9553
9554 if (PyUnicode_READ(kind_self, data_self, offset) ==
9555 PyUnicode_READ(kind_sub, data_sub, 0) &&
9556 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9557 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9558 /* If both are of the same kind, memcmp is sufficient */
9559 if (kind_self == kind_sub) {
9560 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009561 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009562 data_sub,
9563 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009564 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009565 }
Martin Pantere26da7c2016-06-02 10:07:09 +00009566 /* otherwise we have to compare each character by first accessing it */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009567 else {
9568 /* We do not need to compare 0 and len(substring)-1 because
9569 the if statement above ensured already that they are equal
9570 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009571 for (i = 1; i < end_sub; ++i) {
9572 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9573 PyUnicode_READ(kind_sub, data_sub, i))
9574 return 0;
9575 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009576 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009577 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009578 }
9579
9580 return 0;
9581}
9582
Alexander Belopolsky40018472011-02-26 01:02:56 +00009583Py_ssize_t
9584PyUnicode_Tailmatch(PyObject *str,
9585 PyObject *substr,
9586 Py_ssize_t start,
9587 Py_ssize_t end,
9588 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009589{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009590 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009591 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009592
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009593 return tailmatch(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009594}
9595
Guido van Rossumd57fd912000-03-10 22:53:23 +00009596/* Apply fixfct filter to the Unicode object self and return a
9597 reference to the modified object */
9598
Alexander Belopolsky40018472011-02-26 01:02:56 +00009599static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009600fixup(PyObject *self,
9601 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009602{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009603 PyObject *u;
9604 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009605 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009606
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009607 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009608 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009609 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009610 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009611
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009612 /* fix functions return the new maximum character in a string,
9613 if the kind of the resulting unicode object does not change,
9614 everything is fine. Otherwise we need to change the string kind
9615 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009616 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009617
9618 if (maxchar_new == 0) {
9619 /* no changes */;
9620 if (PyUnicode_CheckExact(self)) {
9621 Py_DECREF(u);
9622 Py_INCREF(self);
9623 return self;
9624 }
9625 else
9626 return u;
9627 }
9628
Victor Stinnere6abb482012-05-02 01:15:40 +02009629 maxchar_new = align_maxchar(maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009630
Victor Stinnereaab6042011-12-11 22:22:39 +01009631 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009632 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009633
9634 /* In case the maximum character changed, we need to
9635 convert the string to the new category. */
9636 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9637 if (v == NULL) {
9638 Py_DECREF(u);
9639 return NULL;
9640 }
9641 if (maxchar_new > maxchar_old) {
9642 /* If the maxchar increased so that the kind changed, not all
9643 characters are representable anymore and we need to fix the
9644 string again. This only happens in very few cases. */
Victor Stinnerd3f08822012-05-29 12:57:52 +02009645 _PyUnicode_FastCopyCharacters(v, 0,
9646 self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinnereaab6042011-12-11 22:22:39 +01009647 maxchar_old = fixfct(v);
9648 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009649 }
9650 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009651 _PyUnicode_FastCopyCharacters(v, 0,
9652 u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009653 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009654 Py_DECREF(u);
9655 assert(_PyUnicode_CheckConsistency(v, 1));
9656 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009657}
9658
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009659static PyObject *
9660ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009661{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009662 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9663 char *resdata, *data = PyUnicode_DATA(self);
9664 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009665
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009666 res = PyUnicode_New(len, 127);
9667 if (res == NULL)
9668 return NULL;
9669 resdata = PyUnicode_DATA(res);
9670 if (lower)
9671 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009672 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009673 _Py_bytes_upper(resdata, data, len);
9674 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009675}
9676
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009677static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009678handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009679{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009680 Py_ssize_t j;
9681 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01009682 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009683 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009684
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009685 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9686
9687 where ! is a negation and \p{xxx} is a character with property xxx.
9688 */
9689 for (j = i - 1; j >= 0; j--) {
9690 c = PyUnicode_READ(kind, data, j);
9691 if (!_PyUnicode_IsCaseIgnorable(c))
9692 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009693 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009694 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9695 if (final_sigma) {
9696 for (j = i + 1; j < length; j++) {
9697 c = PyUnicode_READ(kind, data, j);
9698 if (!_PyUnicode_IsCaseIgnorable(c))
9699 break;
9700 }
9701 final_sigma = j == length || !_PyUnicode_IsCased(c);
9702 }
9703 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009704}
9705
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009706static int
9707lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9708 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009709{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009710 /* Obscure special case. */
9711 if (c == 0x3A3) {
9712 mapped[0] = handle_capital_sigma(kind, data, length, i);
9713 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009714 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009715 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009716}
9717
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009718static Py_ssize_t
9719do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009720{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009721 Py_ssize_t i, k = 0;
9722 int n_res, j;
9723 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009724
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009725 c = PyUnicode_READ(kind, data, 0);
9726 n_res = _PyUnicode_ToUpperFull(c, mapped);
9727 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009728 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009729 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009730 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009731 for (i = 1; i < length; i++) {
9732 c = PyUnicode_READ(kind, data, i);
9733 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9734 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009735 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009736 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009737 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009738 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009739 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009740}
9741
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009742static Py_ssize_t
9743do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9744 Py_ssize_t i, k = 0;
9745
9746 for (i = 0; i < length; i++) {
9747 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9748 int n_res, j;
9749 if (Py_UNICODE_ISUPPER(c)) {
9750 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9751 }
9752 else if (Py_UNICODE_ISLOWER(c)) {
9753 n_res = _PyUnicode_ToUpperFull(c, mapped);
9754 }
9755 else {
9756 n_res = 1;
9757 mapped[0] = c;
9758 }
9759 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009760 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009761 res[k++] = mapped[j];
9762 }
9763 }
9764 return k;
9765}
9766
9767static Py_ssize_t
9768do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9769 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009770{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009771 Py_ssize_t i, k = 0;
9772
9773 for (i = 0; i < length; i++) {
9774 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9775 int n_res, j;
9776 if (lower)
9777 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9778 else
9779 n_res = _PyUnicode_ToUpperFull(c, mapped);
9780 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009781 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009782 res[k++] = mapped[j];
9783 }
9784 }
9785 return k;
9786}
9787
9788static Py_ssize_t
9789do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9790{
9791 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9792}
9793
9794static Py_ssize_t
9795do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9796{
9797 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9798}
9799
Benjamin Petersone51757f2012-01-12 21:10:29 -05009800static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009801do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9802{
9803 Py_ssize_t i, k = 0;
9804
9805 for (i = 0; i < length; i++) {
9806 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9807 Py_UCS4 mapped[3];
9808 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9809 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009810 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009811 res[k++] = mapped[j];
9812 }
9813 }
9814 return k;
9815}
9816
9817static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009818do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9819{
9820 Py_ssize_t i, k = 0;
9821 int previous_is_cased;
9822
9823 previous_is_cased = 0;
9824 for (i = 0; i < length; i++) {
9825 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9826 Py_UCS4 mapped[3];
9827 int n_res, j;
9828
9829 if (previous_is_cased)
9830 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9831 else
9832 n_res = _PyUnicode_ToTitleFull(c, mapped);
9833
9834 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009835 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009836 res[k++] = mapped[j];
9837 }
9838
9839 previous_is_cased = _PyUnicode_IsCased(c);
9840 }
9841 return k;
9842}
9843
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009844static PyObject *
9845case_operation(PyObject *self,
9846 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9847{
9848 PyObject *res = NULL;
9849 Py_ssize_t length, newlength = 0;
9850 int kind, outkind;
9851 void *data, *outdata;
9852 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9853
Benjamin Petersoneea48462012-01-16 14:28:50 -05009854 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009855
9856 kind = PyUnicode_KIND(self);
9857 data = PyUnicode_DATA(self);
9858 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +02009859 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -04009860 PyErr_SetString(PyExc_OverflowError, "string is too long");
9861 return NULL;
9862 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -04009863 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009864 if (tmp == NULL)
9865 return PyErr_NoMemory();
9866 newlength = perform(kind, data, length, tmp, &maxchar);
9867 res = PyUnicode_New(newlength, maxchar);
9868 if (res == NULL)
9869 goto leave;
9870 tmpend = tmp + newlength;
9871 outdata = PyUnicode_DATA(res);
9872 outkind = PyUnicode_KIND(res);
9873 switch (outkind) {
9874 case PyUnicode_1BYTE_KIND:
9875 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9876 break;
9877 case PyUnicode_2BYTE_KIND:
9878 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9879 break;
9880 case PyUnicode_4BYTE_KIND:
9881 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9882 break;
9883 default:
9884 assert(0);
9885 break;
9886 }
9887 leave:
9888 PyMem_FREE(tmp);
9889 return res;
9890}
9891
Tim Peters8ce9f162004-08-27 01:49:32 +00009892PyObject *
9893PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009894{
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009895 PyObject *res;
9896 PyObject *fseq;
9897 Py_ssize_t seqlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009898 PyObject **items;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009899
Benjamin Peterson9743b2c2014-02-15 13:02:52 -05009900 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +00009901 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009902 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009903 }
9904
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009905 /* NOTE: the following code can't call back into Python code,
9906 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009907 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009908
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009909 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00009910 seqlen = PySequence_Fast_GET_SIZE(fseq);
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009911 res = _PyUnicode_JoinArray(separator, items, seqlen);
9912 Py_DECREF(fseq);
9913 return res;
9914}
9915
9916PyObject *
9917_PyUnicode_JoinArray(PyObject *separator, PyObject **items, Py_ssize_t seqlen)
9918{
9919 PyObject *res = NULL; /* the result */
9920 PyObject *sep = NULL;
9921 Py_ssize_t seplen;
9922 PyObject *item;
9923 Py_ssize_t sz, i, res_offset;
9924 Py_UCS4 maxchar;
9925 Py_UCS4 item_maxchar;
9926 int use_memcpy;
9927 unsigned char *res_data = NULL, *sep_data = NULL;
9928 PyObject *last_obj;
9929 unsigned int kind = 0;
9930
Tim Peters05eba1f2004-08-27 21:32:02 +00009931 /* If empty sequence, return u"". */
9932 if (seqlen == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02009933 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +00009934 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009935
Tim Peters05eba1f2004-08-27 21:32:02 +00009936 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009937 last_obj = NULL;
Victor Stinneracf47b82011-10-06 12:32:37 +02009938 if (seqlen == 1) {
9939 if (PyUnicode_CheckExact(items[0])) {
9940 res = items[0];
9941 Py_INCREF(res);
Victor Stinneracf47b82011-10-06 12:32:37 +02009942 return res;
9943 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009944 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009945 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009946 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009947 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009948 /* Set up sep and seplen */
9949 if (separator == NULL) {
9950 /* fall back to a blank space separator */
9951 sep = PyUnicode_FromOrdinal(' ');
9952 if (!sep)
9953 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009954 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009955 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009956 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009957 else {
9958 if (!PyUnicode_Check(separator)) {
9959 PyErr_Format(PyExc_TypeError,
9960 "separator: expected str instance,"
9961 " %.80s found",
9962 Py_TYPE(separator)->tp_name);
9963 goto onError;
9964 }
9965 if (PyUnicode_READY(separator))
9966 goto onError;
9967 sep = separator;
9968 seplen = PyUnicode_GET_LENGTH(separator);
9969 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9970 /* inc refcount to keep this code path symmetric with the
9971 above case of a blank separator */
9972 Py_INCREF(sep);
9973 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009974 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009975 }
9976
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009977 /* There are at least two things to join, or else we have a subclass
9978 * of str in the sequence.
9979 * Do a pre-pass to figure out the total amount of space we'll
9980 * need (sz), and see whether all argument are strings.
9981 */
9982 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009983#ifdef Py_DEBUG
9984 use_memcpy = 0;
9985#else
9986 use_memcpy = 1;
9987#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009988 for (i = 0; i < seqlen; i++) {
9989 const Py_ssize_t old_sz = sz;
9990 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009991 if (!PyUnicode_Check(item)) {
9992 PyErr_Format(PyExc_TypeError,
Victor Stinnera33bce02014-07-04 22:47:46 +02009993 "sequence item %zd: expected str instance,"
Benjamin Peterson29060642009-01-31 22:14:21 +00009994 " %.80s found",
9995 i, Py_TYPE(item)->tp_name);
9996 goto onError;
9997 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009998 if (PyUnicode_READY(item) == -1)
9999 goto onError;
10000 sz += PyUnicode_GET_LENGTH(item);
10001 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010002 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010003 if (i != 0)
10004 sz += seplen;
10005 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
10006 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010007 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010008 goto onError;
10009 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010010 if (use_memcpy && last_obj != NULL) {
10011 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10012 use_memcpy = 0;
10013 }
10014 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010015 }
Tim Petersced69f82003-09-16 20:30:58 +000010016
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010017 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010018 if (res == NULL)
10019 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +000010020
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010021 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010022#ifdef Py_DEBUG
10023 use_memcpy = 0;
10024#else
10025 if (use_memcpy) {
10026 res_data = PyUnicode_1BYTE_DATA(res);
10027 kind = PyUnicode_KIND(res);
10028 if (seplen != 0)
10029 sep_data = PyUnicode_1BYTE_DATA(sep);
10030 }
10031#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +020010032 if (use_memcpy) {
10033 for (i = 0; i < seqlen; ++i) {
10034 Py_ssize_t itemlen;
10035 item = items[i];
10036
10037 /* Copy item, and maybe the separator. */
10038 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +020010039 Py_MEMCPY(res_data,
10040 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010041 kind * seplen);
10042 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010043 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010044
10045 itemlen = PyUnicode_GET_LENGTH(item);
10046 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +020010047 Py_MEMCPY(res_data,
10048 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010049 kind * itemlen);
10050 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010051 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010052 }
10053 assert(res_data == PyUnicode_1BYTE_DATA(res)
10054 + kind * PyUnicode_GET_LENGTH(res));
10055 }
10056 else {
10057 for (i = 0, res_offset = 0; i < seqlen; ++i) {
10058 Py_ssize_t itemlen;
10059 item = items[i];
10060
10061 /* Copy item, and maybe the separator. */
10062 if (i && seplen != 0) {
10063 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10064 res_offset += seplen;
10065 }
10066
10067 itemlen = PyUnicode_GET_LENGTH(item);
10068 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020010069 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +020010070 res_offset += itemlen;
10071 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010072 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010073 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +020010074 }
Tim Peters8ce9f162004-08-27 01:49:32 +000010075
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010076 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010077 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010078 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010079
Benjamin Peterson29060642009-01-31 22:14:21 +000010080 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010081 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010082 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010083 return NULL;
10084}
10085
/* FILL(kind, data, value, start, length): store `length` copies of `value`
   into the character buffer `data`, beginning at character index `start`.
   `kind` selects the element width (1, 2 or 4 bytes); the wchar kind is
   rejected by an assertion.  Every argument is parenthesized on use so that
   an expression argument is taken as a whole; note that `value` and
   `length` may still be evaluated more than once. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
10109
Victor Stinnerd3f08822012-05-29 12:57:52 +020010110void
10111_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10112 Py_UCS4 fill_char)
10113{
10114 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
10115 const void *data = PyUnicode_DATA(unicode);
10116 assert(PyUnicode_IS_READY(unicode));
10117 assert(unicode_modifiable(unicode));
10118 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10119 assert(start >= 0);
10120 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
10121 FILL(kind, data, fill_char, start, length);
10122}
10123
Victor Stinner3fe55312012-01-04 00:33:50 +010010124Py_ssize_t
10125PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10126 Py_UCS4 fill_char)
10127{
10128 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +010010129
10130 if (!PyUnicode_Check(unicode)) {
10131 PyErr_BadInternalCall();
10132 return -1;
10133 }
10134 if (PyUnicode_READY(unicode) == -1)
10135 return -1;
10136 if (unicode_check_modifiable(unicode))
10137 return -1;
10138
Victor Stinnerd3f08822012-05-29 12:57:52 +020010139 if (start < 0) {
10140 PyErr_SetString(PyExc_IndexError, "string index out of range");
10141 return -1;
10142 }
Victor Stinner3fe55312012-01-04 00:33:50 +010010143 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10144 PyErr_SetString(PyExc_ValueError,
10145 "fill character is bigger than "
10146 "the string maximum character");
10147 return -1;
10148 }
10149
10150 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10151 length = Py_MIN(maxlen, length);
10152 if (length <= 0)
10153 return 0;
10154
Victor Stinnerd3f08822012-05-29 12:57:52 +020010155 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +010010156 return length;
10157}
10158
Victor Stinner9310abb2011-10-05 00:59:23 +020010159static PyObject *
10160pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010161 Py_ssize_t left,
10162 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010163 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010164{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010165 PyObject *u;
10166 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010167 int kind;
10168 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010169
10170 if (left < 0)
10171 left = 0;
10172 if (right < 0)
10173 right = 0;
10174
Victor Stinnerc4b49542011-12-11 22:44:26 +010010175 if (left == 0 && right == 0)
10176 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010177
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010178 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10179 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010180 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10181 return NULL;
10182 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010183 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010184 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010185 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010186 if (!u)
10187 return NULL;
10188
10189 kind = PyUnicode_KIND(u);
10190 data = PyUnicode_DATA(u);
10191 if (left)
10192 FILL(kind, data, fill, 0, left);
10193 if (right)
10194 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010195 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010196 assert(_PyUnicode_CheckConsistency(u, 1));
10197 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010198}
10199
Alexander Belopolsky40018472011-02-26 01:02:56 +000010200PyObject *
10201PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010202{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010203 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010204
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010205 if (ensure_unicode(string) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010206 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010207
Benjamin Petersonead6b532011-12-20 17:23:42 -060010208 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010209 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010210 if (PyUnicode_IS_ASCII(string))
10211 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010212 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010213 PyUnicode_GET_LENGTH(string), keepends);
10214 else
10215 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010216 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010217 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010218 break;
10219 case PyUnicode_2BYTE_KIND:
10220 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010221 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010222 PyUnicode_GET_LENGTH(string), keepends);
10223 break;
10224 case PyUnicode_4BYTE_KIND:
10225 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010226 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010227 PyUnicode_GET_LENGTH(string), keepends);
10228 break;
10229 default:
10230 assert(0);
10231 list = 0;
10232 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010233 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010234}
10235
Alexander Belopolsky40018472011-02-26 01:02:56 +000010236static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010237split(PyObject *self,
10238 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010239 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010240{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010241 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010242 void *buf1, *buf2;
10243 Py_ssize_t len1, len2;
10244 PyObject* out;
10245
Guido van Rossumd57fd912000-03-10 22:53:23 +000010246 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010247 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010248
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010249 if (PyUnicode_READY(self) == -1)
10250 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010251
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010252 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010253 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010254 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010255 if (PyUnicode_IS_ASCII(self))
10256 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010257 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010258 PyUnicode_GET_LENGTH(self), maxcount
10259 );
10260 else
10261 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010262 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010263 PyUnicode_GET_LENGTH(self), maxcount
10264 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010265 case PyUnicode_2BYTE_KIND:
10266 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010267 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010268 PyUnicode_GET_LENGTH(self), maxcount
10269 );
10270 case PyUnicode_4BYTE_KIND:
10271 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010272 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010273 PyUnicode_GET_LENGTH(self), maxcount
10274 );
10275 default:
10276 assert(0);
10277 return NULL;
10278 }
10279
10280 if (PyUnicode_READY(substring) == -1)
10281 return NULL;
10282
10283 kind1 = PyUnicode_KIND(self);
10284 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010285 len1 = PyUnicode_GET_LENGTH(self);
10286 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010287 if (kind1 < kind2 || len1 < len2) {
10288 out = PyList_New(1);
10289 if (out == NULL)
10290 return NULL;
10291 Py_INCREF(self);
10292 PyList_SET_ITEM(out, 0, self);
10293 return out;
10294 }
10295 buf1 = PyUnicode_DATA(self);
10296 buf2 = PyUnicode_DATA(substring);
10297 if (kind2 != kind1) {
10298 buf2 = _PyUnicode_AsKind(substring, kind1);
10299 if (!buf2)
10300 return NULL;
10301 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010302
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010303 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010304 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010305 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10306 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010307 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010308 else
10309 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010310 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010311 break;
10312 case PyUnicode_2BYTE_KIND:
10313 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010314 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010315 break;
10316 case PyUnicode_4BYTE_KIND:
10317 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010318 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010319 break;
10320 default:
10321 out = NULL;
10322 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010323 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010324 PyMem_Free(buf2);
10325 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010326}
10327
Alexander Belopolsky40018472011-02-26 01:02:56 +000010328static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010329rsplit(PyObject *self,
10330 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010331 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010332{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010333 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010334 void *buf1, *buf2;
10335 Py_ssize_t len1, len2;
10336 PyObject* out;
10337
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010338 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010339 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010340
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010341 if (PyUnicode_READY(self) == -1)
10342 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010343
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010344 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010345 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010346 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010347 if (PyUnicode_IS_ASCII(self))
10348 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010349 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010350 PyUnicode_GET_LENGTH(self), maxcount
10351 );
10352 else
10353 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010354 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010355 PyUnicode_GET_LENGTH(self), maxcount
10356 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010357 case PyUnicode_2BYTE_KIND:
10358 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010359 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010360 PyUnicode_GET_LENGTH(self), maxcount
10361 );
10362 case PyUnicode_4BYTE_KIND:
10363 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010364 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010365 PyUnicode_GET_LENGTH(self), maxcount
10366 );
10367 default:
10368 assert(0);
10369 return NULL;
10370 }
10371
10372 if (PyUnicode_READY(substring) == -1)
10373 return NULL;
10374
10375 kind1 = PyUnicode_KIND(self);
10376 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010377 len1 = PyUnicode_GET_LENGTH(self);
10378 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010379 if (kind1 < kind2 || len1 < len2) {
10380 out = PyList_New(1);
10381 if (out == NULL)
10382 return NULL;
10383 Py_INCREF(self);
10384 PyList_SET_ITEM(out, 0, self);
10385 return out;
10386 }
10387 buf1 = PyUnicode_DATA(self);
10388 buf2 = PyUnicode_DATA(substring);
10389 if (kind2 != kind1) {
10390 buf2 = _PyUnicode_AsKind(substring, kind1);
10391 if (!buf2)
10392 return NULL;
10393 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010394
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010395 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010396 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010397 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10398 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010399 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010400 else
10401 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010402 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010403 break;
10404 case PyUnicode_2BYTE_KIND:
10405 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010406 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010407 break;
10408 case PyUnicode_4BYTE_KIND:
10409 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010410 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010411 break;
10412 default:
10413 out = NULL;
10414 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010415 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010416 PyMem_Free(buf2);
10417 return out;
10418}
10419
10420static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010421anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10422 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010423{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010424 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010425 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010426 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10427 return asciilib_find(buf1, len1, buf2, len2, offset);
10428 else
10429 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010430 case PyUnicode_2BYTE_KIND:
10431 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10432 case PyUnicode_4BYTE_KIND:
10433 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10434 }
10435 assert(0);
10436 return -1;
10437}
10438
10439static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010440anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10441 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010442{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010443 switch (kind) {
10444 case PyUnicode_1BYTE_KIND:
10445 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10446 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10447 else
10448 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10449 case PyUnicode_2BYTE_KIND:
10450 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10451 case PyUnicode_4BYTE_KIND:
10452 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10453 }
10454 assert(0);
10455 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010456}
10457
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010458static void
10459replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10460 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10461{
10462 int kind = PyUnicode_KIND(u);
10463 void *data = PyUnicode_DATA(u);
10464 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10465 if (kind == PyUnicode_1BYTE_KIND) {
10466 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10467 (Py_UCS1 *)data + len,
10468 u1, u2, maxcount);
10469 }
10470 else if (kind == PyUnicode_2BYTE_KIND) {
10471 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10472 (Py_UCS2 *)data + len,
10473 u1, u2, maxcount);
10474 }
10475 else {
10476 assert(kind == PyUnicode_4BYTE_KIND);
10477 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10478 (Py_UCS4 *)data + len,
10479 u1, u2, maxcount);
10480 }
10481}
10482
Alexander Belopolsky40018472011-02-26 01:02:56 +000010483static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010484replace(PyObject *self, PyObject *str1,
10485 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010486{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010487 PyObject *u;
10488 char *sbuf = PyUnicode_DATA(self);
10489 char *buf1 = PyUnicode_DATA(str1);
10490 char *buf2 = PyUnicode_DATA(str2);
10491 int srelease = 0, release1 = 0, release2 = 0;
10492 int skind = PyUnicode_KIND(self);
10493 int kind1 = PyUnicode_KIND(str1);
10494 int kind2 = PyUnicode_KIND(str2);
10495 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10496 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10497 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010498 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010499 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010500
10501 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010502 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010503 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010504 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010505
Victor Stinner59de0ee2011-10-07 10:01:28 +020010506 if (str1 == str2)
10507 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010508
Victor Stinner49a0a212011-10-12 23:46:10 +020010509 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010510 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10511 if (maxchar < maxchar_str1)
10512 /* substring too wide to be present */
10513 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010514 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10515 /* Replacing str1 with str2 may cause a maxchar reduction in the
10516 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010517 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010518 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010519
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010520 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010521 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010522 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010523 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010524 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010525 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010526 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010527 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010528
Victor Stinner69ed0f42013-04-09 21:48:24 +020010529 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010530 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010531 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010532 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010533 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010534 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010535 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010536 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010537
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010538 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10539 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010540 }
10541 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010542 int rkind = skind;
10543 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010544 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010545
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010546 if (kind1 < rkind) {
10547 /* widen substring */
10548 buf1 = _PyUnicode_AsKind(str1, rkind);
10549 if (!buf1) goto error;
10550 release1 = 1;
10551 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010552 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010553 if (i < 0)
10554 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010555 if (rkind > kind2) {
10556 /* widen replacement */
10557 buf2 = _PyUnicode_AsKind(str2, rkind);
10558 if (!buf2) goto error;
10559 release2 = 1;
10560 }
10561 else if (rkind < kind2) {
10562 /* widen self and buf1 */
10563 rkind = kind2;
10564 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010565 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010566 sbuf = _PyUnicode_AsKind(self, rkind);
10567 if (!sbuf) goto error;
10568 srelease = 1;
10569 buf1 = _PyUnicode_AsKind(str1, rkind);
10570 if (!buf1) goto error;
10571 release1 = 1;
10572 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010573 u = PyUnicode_New(slen, maxchar);
10574 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010575 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010576 assert(PyUnicode_KIND(u) == rkind);
10577 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010578
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010579 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010580 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010581 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010582 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010583 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010584 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010585
10586 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010587 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010588 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010589 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010590 if (i == -1)
10591 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010592 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010593 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010594 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010595 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010596 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010597 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010598 }
10599 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010600 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010601 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010602 int rkind = skind;
10603 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010604
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010605 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010606 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010607 buf1 = _PyUnicode_AsKind(str1, rkind);
10608 if (!buf1) goto error;
10609 release1 = 1;
10610 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010611 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010612 if (n == 0)
10613 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010614 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010615 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010616 buf2 = _PyUnicode_AsKind(str2, rkind);
10617 if (!buf2) goto error;
10618 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010619 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010620 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010621 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010622 rkind = kind2;
10623 sbuf = _PyUnicode_AsKind(self, rkind);
10624 if (!sbuf) goto error;
10625 srelease = 1;
10626 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010627 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010628 buf1 = _PyUnicode_AsKind(str1, rkind);
10629 if (!buf1) goto error;
10630 release1 = 1;
10631 }
10632 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10633 PyUnicode_GET_LENGTH(str1))); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010634 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010635 PyErr_SetString(PyExc_OverflowError,
10636 "replace string is too long");
10637 goto error;
10638 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010639 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010640 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010641 _Py_INCREF_UNICODE_EMPTY();
10642 if (!unicode_empty)
10643 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010644 u = unicode_empty;
10645 goto done;
10646 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010647 if (new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010648 PyErr_SetString(PyExc_OverflowError,
10649 "replace string is too long");
10650 goto error;
10651 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010652 u = PyUnicode_New(new_size, maxchar);
10653 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010654 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010655 assert(PyUnicode_KIND(u) == rkind);
10656 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010657 ires = i = 0;
10658 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010659 while (n-- > 0) {
10660 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010661 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010662 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010663 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010664 if (j == -1)
10665 break;
10666 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010667 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010668 memcpy(res + rkind * ires,
10669 sbuf + rkind * i,
10670 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010671 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010672 }
10673 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010674 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010675 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010676 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010677 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010678 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010679 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010680 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010681 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010682 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010683 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010684 memcpy(res + rkind * ires,
10685 sbuf + rkind * i,
10686 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010687 }
10688 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010689 /* interleave */
10690 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010691 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010692 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010693 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010694 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010695 if (--n <= 0)
10696 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010697 memcpy(res + rkind * ires,
10698 sbuf + rkind * i,
10699 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010700 ires++;
10701 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010702 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010703 memcpy(res + rkind * ires,
10704 sbuf + rkind * i,
10705 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010706 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010707 }
10708
10709 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010710 unicode_adjust_maxchar(&u);
10711 if (u == NULL)
10712 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010713 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010714
10715 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010716 if (srelease)
10717 PyMem_FREE(sbuf);
10718 if (release1)
10719 PyMem_FREE(buf1);
10720 if (release2)
10721 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010722 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010723 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010724
Benjamin Peterson29060642009-01-31 22:14:21 +000010725 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010726 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010727 if (srelease)
10728 PyMem_FREE(sbuf);
10729 if (release1)
10730 PyMem_FREE(buf1);
10731 if (release2)
10732 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010733 return unicode_result_unchanged(self);
10734
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010735 error:
10736 if (srelease && sbuf)
10737 PyMem_FREE(sbuf);
10738 if (release1 && buf1)
10739 PyMem_FREE(buf1);
10740 if (release2 && buf2)
10741 PyMem_FREE(buf2);
10742 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010743}
10744
10745/* --- Unicode Object Methods --------------------------------------------- */
10746
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010747PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010748 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010749\n\
10750Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010751characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010752
10753static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010754unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010755{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010756 if (PyUnicode_READY(self) == -1)
10757 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010758 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010759}
10760
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010761PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010762 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010763\n\
10764Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010765have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010766
10767static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010768unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010769{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010770 if (PyUnicode_READY(self) == -1)
10771 return NULL;
10772 if (PyUnicode_GET_LENGTH(self) == 0)
10773 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010774 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010775}
10776
Benjamin Petersond5890c82012-01-14 13:23:30 -050010777PyDoc_STRVAR(casefold__doc__,
10778 "S.casefold() -> str\n\
10779\n\
10780Return a version of S suitable for caseless comparisons.");
10781
10782static PyObject *
10783unicode_casefold(PyObject *self)
10784{
10785 if (PyUnicode_READY(self) == -1)
10786 return NULL;
10787 if (PyUnicode_IS_ASCII(self))
10788 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010789 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010790}
10791
10792
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010793/* Argument converter. Accepts a single Unicode character. */
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010794
10795static int
10796convert_uc(PyObject *obj, void *addr)
10797{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010798 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010799
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010800 if (!PyUnicode_Check(obj)) {
10801 PyErr_Format(PyExc_TypeError,
10802 "The fill character must be a unicode character, "
10803 "not %.100s", Py_TYPE(obj)->tp_name);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010804 return 0;
10805 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010806 if (PyUnicode_READY(obj) < 0)
10807 return 0;
10808 if (PyUnicode_GET_LENGTH(obj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010809 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010810 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010811 return 0;
10812 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010813 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010814 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010815}
10816
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010817PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010818 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010819\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010820Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010821done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010822
10823static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010824unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010825{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010826 Py_ssize_t marg, left;
10827 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010828 Py_UCS4 fillchar = ' ';
10829
Victor Stinnere9a29352011-10-01 02:14:59 +020010830 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010831 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010832
Benjamin Petersonbac79492012-01-14 13:34:47 -050010833 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010834 return NULL;
10835
Victor Stinnerc4b49542011-12-11 22:44:26 +010010836 if (PyUnicode_GET_LENGTH(self) >= width)
10837 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010838
Victor Stinnerc4b49542011-12-11 22:44:26 +010010839 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010840 left = marg / 2 + (marg & width & 1);
10841
Victor Stinner9310abb2011-10-05 00:59:23 +020010842 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010843}
10844
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010845/* This function assumes that str1 and str2 are readied by the caller. */
10846
Marc-André Lemburge5034372000-08-08 08:04:29 +000010847static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010848unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010849{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010850#define COMPARE(TYPE1, TYPE2) \
10851 do { \
10852 TYPE1* p1 = (TYPE1 *)data1; \
10853 TYPE2* p2 = (TYPE2 *)data2; \
10854 TYPE1* end = p1 + len; \
10855 Py_UCS4 c1, c2; \
10856 for (; p1 != end; p1++, p2++) { \
10857 c1 = *p1; \
10858 c2 = *p2; \
10859 if (c1 != c2) \
10860 return (c1 < c2) ? -1 : 1; \
10861 } \
10862 } \
10863 while (0)
10864
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010865 int kind1, kind2;
10866 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010867 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010868
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010869 kind1 = PyUnicode_KIND(str1);
10870 kind2 = PyUnicode_KIND(str2);
10871 data1 = PyUnicode_DATA(str1);
10872 data2 = PyUnicode_DATA(str2);
10873 len1 = PyUnicode_GET_LENGTH(str1);
10874 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010875 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010876
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010877 switch(kind1) {
10878 case PyUnicode_1BYTE_KIND:
10879 {
10880 switch(kind2) {
10881 case PyUnicode_1BYTE_KIND:
10882 {
10883 int cmp = memcmp(data1, data2, len);
10884 /* normalize result of memcmp() into the range [-1; 1] */
10885 if (cmp < 0)
10886 return -1;
10887 if (cmp > 0)
10888 return 1;
10889 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020010890 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010891 case PyUnicode_2BYTE_KIND:
10892 COMPARE(Py_UCS1, Py_UCS2);
10893 break;
10894 case PyUnicode_4BYTE_KIND:
10895 COMPARE(Py_UCS1, Py_UCS4);
10896 break;
10897 default:
10898 assert(0);
10899 }
10900 break;
10901 }
10902 case PyUnicode_2BYTE_KIND:
10903 {
10904 switch(kind2) {
10905 case PyUnicode_1BYTE_KIND:
10906 COMPARE(Py_UCS2, Py_UCS1);
10907 break;
10908 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010909 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010910 COMPARE(Py_UCS2, Py_UCS2);
10911 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010912 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010913 case PyUnicode_4BYTE_KIND:
10914 COMPARE(Py_UCS2, Py_UCS4);
10915 break;
10916 default:
10917 assert(0);
10918 }
10919 break;
10920 }
10921 case PyUnicode_4BYTE_KIND:
10922 {
10923 switch(kind2) {
10924 case PyUnicode_1BYTE_KIND:
10925 COMPARE(Py_UCS4, Py_UCS1);
10926 break;
10927 case PyUnicode_2BYTE_KIND:
10928 COMPARE(Py_UCS4, Py_UCS2);
10929 break;
10930 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010931 {
10932#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10933 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10934 /* normalize result of wmemcmp() into the range [-1; 1] */
10935 if (cmp < 0)
10936 return -1;
10937 if (cmp > 0)
10938 return 1;
10939#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010940 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020010941#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010942 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010943 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010944 default:
10945 assert(0);
10946 }
10947 break;
10948 }
10949 default:
10950 assert(0);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010951 }
10952
Victor Stinner770e19e2012-10-04 22:59:45 +020010953 if (len1 == len2)
10954 return 0;
10955 if (len1 < len2)
10956 return -1;
10957 else
10958 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010959
10960#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000010961}
10962
Victor Stinnerc8bc5372013-11-04 11:08:10 +010010963Py_LOCAL(int)
Victor Stinnere5567ad2012-10-23 02:48:49 +020010964unicode_compare_eq(PyObject *str1, PyObject *str2)
10965{
10966 int kind;
10967 void *data1, *data2;
10968 Py_ssize_t len;
10969 int cmp;
10970
Victor Stinnere5567ad2012-10-23 02:48:49 +020010971 len = PyUnicode_GET_LENGTH(str1);
10972 if (PyUnicode_GET_LENGTH(str2) != len)
10973 return 0;
10974 kind = PyUnicode_KIND(str1);
10975 if (PyUnicode_KIND(str2) != kind)
10976 return 0;
10977 data1 = PyUnicode_DATA(str1);
10978 data2 = PyUnicode_DATA(str2);
10979
10980 cmp = memcmp(data1, data2, len * kind);
10981 return (cmp == 0);
10982}
10983
10984
Alexander Belopolsky40018472011-02-26 01:02:56 +000010985int
10986PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010987{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010988 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10989 if (PyUnicode_READY(left) == -1 ||
10990 PyUnicode_READY(right) == -1)
10991 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010010992
10993 /* a string is equal to itself */
10994 if (left == right)
10995 return 0;
10996
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010997 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010998 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010999 PyErr_Format(PyExc_TypeError,
11000 "Can't compare %.100s and %.100s",
11001 left->ob_type->tp_name,
11002 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011003 return -1;
11004}
11005
Martin v. Löwis5b222132007-06-10 09:51:05 +000011006int
Victor Stinnerad14ccd2013-11-07 00:46:04 +010011007_PyUnicode_CompareWithId(PyObject *left, _Py_Identifier *right)
11008{
11009 PyObject *right_str = _PyUnicode_FromId(right); /* borrowed */
11010 if (right_str == NULL)
11011 return -1;
11012 return PyUnicode_Compare(left, right_str);
11013}
11014
11015int
Martin v. Löwis5b222132007-06-10 09:51:05 +000011016PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11017{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011018 Py_ssize_t i;
11019 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011020 Py_UCS4 chr;
11021
Victor Stinner910337b2011-10-03 03:20:16 +020011022 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011023 if (PyUnicode_READY(uni) == -1)
11024 return -1;
11025 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011026 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010011027 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010011028 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011029 size_t len, len2 = strlen(str);
11030 int cmp;
11031
11032 len = Py_MIN(len1, len2);
11033 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010011034 if (cmp != 0) {
11035 if (cmp < 0)
11036 return -1;
11037 else
11038 return 1;
11039 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010011040 if (len1 > len2)
11041 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011042 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010011043 return -1; /* str is longer */
11044 return 0;
11045 }
11046 else {
11047 void *data = PyUnicode_DATA(uni);
11048 /* Compare Unicode string and source character set string */
11049 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020011050 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010011051 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11052 /* This check keeps Python strings that end in '\0' from comparing equal
11053 to C strings identical up to that point. */
11054 if (PyUnicode_GET_LENGTH(uni) != i || chr)
11055 return 1; /* uni is longer */
11056 if (str[i])
11057 return -1; /* str is longer */
11058 return 0;
11059 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000011060}
11061
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011062
Benjamin Peterson29060642009-01-31 22:14:21 +000011063#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000011064 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011065
Alexander Belopolsky40018472011-02-26 01:02:56 +000011066PyObject *
11067PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011068{
11069 int result;
Victor Stinnere5567ad2012-10-23 02:48:49 +020011070 PyObject *v;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011071
Victor Stinnere5567ad2012-10-23 02:48:49 +020011072 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11073 Py_RETURN_NOTIMPLEMENTED;
11074
11075 if (PyUnicode_READY(left) == -1 ||
11076 PyUnicode_READY(right) == -1)
11077 return NULL;
11078
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011079 if (left == right) {
11080 switch (op) {
11081 case Py_EQ:
11082 case Py_LE:
11083 case Py_GE:
11084 /* a string is equal to itself */
11085 v = Py_True;
11086 break;
11087 case Py_NE:
11088 case Py_LT:
11089 case Py_GT:
11090 v = Py_False;
11091 break;
11092 default:
11093 PyErr_BadArgument();
11094 return NULL;
11095 }
11096 }
11097 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020011098 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010011099 result ^= (op == Py_NE);
11100 v = TEST_COND(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020011101 }
11102 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020011103 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011104
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011105 /* Convert the return value to a Boolean */
11106 switch (op) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011107 case Py_LE:
11108 v = TEST_COND(result <= 0);
11109 break;
11110 case Py_GE:
11111 v = TEST_COND(result >= 0);
11112 break;
11113 case Py_LT:
11114 v = TEST_COND(result == -1);
11115 break;
11116 case Py_GT:
11117 v = TEST_COND(result == 1);
11118 break;
11119 default:
11120 PyErr_BadArgument();
11121 return NULL;
11122 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011123 }
Victor Stinnere5567ad2012-10-23 02:48:49 +020011124 Py_INCREF(v);
11125 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011126}
11127
Alexander Belopolsky40018472011-02-26 01:02:56 +000011128int
Raymond Hettingerac2ef652015-07-04 16:04:44 -070011129_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11130{
11131 return unicode_eq(aa, bb);
11132}
11133
11134int
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011135PyUnicode_Contains(PyObject *str, PyObject *substr)
Guido van Rossum403d68b2000-03-13 15:55:09 +000011136{
Victor Stinner77282cb2013-04-14 19:22:47 +020011137 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011138 void *buf1, *buf2;
11139 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011140 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011141
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011142 if (!PyUnicode_Check(substr)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011143 PyErr_Format(PyExc_TypeError,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011144 "'in <string>' requires string as left operand, not %.100s",
11145 Py_TYPE(substr)->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011146 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011147 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011148 if (PyUnicode_READY(substr) == -1)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011149 return -1;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011150 if (ensure_unicode(str) < 0)
11151 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011152
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011153 kind1 = PyUnicode_KIND(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011154 kind2 = PyUnicode_KIND(substr);
11155 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011156 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011157 len1 = PyUnicode_GET_LENGTH(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011158 len2 = PyUnicode_GET_LENGTH(substr);
11159 if (len1 < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011160 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011161 buf1 = PyUnicode_DATA(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011162 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011163 if (len2 == 1) {
11164 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11165 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011166 return result;
11167 }
11168 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011169 buf2 = _PyUnicode_AsKind(substr, kind1);
11170 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011171 return -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011172 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011173
Victor Stinner77282cb2013-04-14 19:22:47 +020011174 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011175 case PyUnicode_1BYTE_KIND:
11176 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11177 break;
11178 case PyUnicode_2BYTE_KIND:
11179 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11180 break;
11181 case PyUnicode_4BYTE_KIND:
11182 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11183 break;
11184 default:
11185 result = -1;
11186 assert(0);
11187 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011188
Victor Stinner77282cb2013-04-14 19:22:47 +020011189 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011190 PyMem_Free(buf2);
11191
Guido van Rossum403d68b2000-03-13 15:55:09 +000011192 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011193}
11194
Guido van Rossumd57fd912000-03-10 22:53:23 +000011195/* Concat to string or Unicode object giving a new Unicode object. */
11196
Alexander Belopolsky40018472011-02-26 01:02:56 +000011197PyObject *
11198PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011199{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011200 PyObject *result;
Victor Stinner127226b2011-10-13 01:12:34 +020011201 Py_UCS4 maxchar, maxchar2;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011202 Py_ssize_t left_len, right_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011203
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011204 if (ensure_unicode(left) < 0 || ensure_unicode(right) < 0)
11205 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011206
11207 /* Shortcuts */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011208 if (left == unicode_empty)
11209 return PyUnicode_FromObject(right);
11210 if (right == unicode_empty)
11211 return PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011212
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011213 left_len = PyUnicode_GET_LENGTH(left);
11214 right_len = PyUnicode_GET_LENGTH(right);
11215 if (left_len > PY_SSIZE_T_MAX - right_len) {
Victor Stinner488fa492011-12-12 00:01:39 +010011216 PyErr_SetString(PyExc_OverflowError,
11217 "strings are too large to concat");
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011218 return NULL;
Victor Stinner488fa492011-12-12 00:01:39 +010011219 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011220 new_len = left_len + right_len;
Victor Stinner488fa492011-12-12 00:01:39 +010011221
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011222 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11223 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011224 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011225
Guido van Rossumd57fd912000-03-10 22:53:23 +000011226 /* Concat the two Unicode strings */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011227 result = PyUnicode_New(new_len, maxchar);
11228 if (result == NULL)
11229 return NULL;
11230 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11231 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11232 assert(_PyUnicode_CheckConsistency(result, 1));
11233 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011234}
11235
Walter Dörwald1ab83302007-05-18 17:15:44 +000011236void
Victor Stinner23e56682011-10-03 03:54:37 +020011237PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011238{
Victor Stinner23e56682011-10-03 03:54:37 +020011239 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011240 Py_UCS4 maxchar, maxchar2;
11241 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011242
11243 if (p_left == NULL) {
11244 if (!PyErr_Occurred())
11245 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011246 return;
11247 }
Victor Stinner23e56682011-10-03 03:54:37 +020011248 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011249 if (right == NULL || left == NULL
11250 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011251 if (!PyErr_Occurred())
11252 PyErr_BadInternalCall();
11253 goto error;
11254 }
11255
Benjamin Petersonbac79492012-01-14 13:34:47 -050011256 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011257 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011258 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011259 goto error;
11260
Victor Stinner488fa492011-12-12 00:01:39 +010011261 /* Shortcuts */
11262 if (left == unicode_empty) {
11263 Py_DECREF(left);
11264 Py_INCREF(right);
11265 *p_left = right;
11266 return;
11267 }
11268 if (right == unicode_empty)
11269 return;
11270
11271 left_len = PyUnicode_GET_LENGTH(left);
11272 right_len = PyUnicode_GET_LENGTH(right);
11273 if (left_len > PY_SSIZE_T_MAX - right_len) {
11274 PyErr_SetString(PyExc_OverflowError,
11275 "strings are too large to concat");
11276 goto error;
11277 }
11278 new_len = left_len + right_len;
11279
11280 if (unicode_modifiable(left)
11281 && PyUnicode_CheckExact(right)
11282 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011283 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11284 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011285 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011286 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011287 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11288 {
11289 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011290 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011291 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011292
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011293 /* copy 'right' into the newly allocated area of 'left' */
11294 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011295 }
Victor Stinner488fa492011-12-12 00:01:39 +010011296 else {
11297 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11298 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011299 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011300
Victor Stinner488fa492011-12-12 00:01:39 +010011301 /* Concat the two Unicode strings */
11302 res = PyUnicode_New(new_len, maxchar);
11303 if (res == NULL)
11304 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011305 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11306 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011307 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011308 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011309 }
11310 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011311 return;
11312
11313error:
Victor Stinner488fa492011-12-12 00:01:39 +010011314 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011315}
11316
11317void
11318PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11319{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011320 PyUnicode_Append(pleft, right);
11321 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011322}
11323
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011324/*
11325Wraps stringlib_parse_args_finds() and additionally ensures that the
11326first argument is a unicode object.
11327*/
11328
11329Py_LOCAL_INLINE(int)
11330parse_args_finds_unicode(const char * function_name, PyObject *args,
11331 PyObject **substring,
11332 Py_ssize_t *start, Py_ssize_t *end)
11333{
11334 if(stringlib_parse_args_finds(function_name, args, substring,
11335 start, end)) {
11336 if (ensure_unicode(*substring) < 0)
11337 return 0;
11338 return 1;
11339 }
11340 return 0;
11341}
11342
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011343PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011344 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011345\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011346Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011347string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011348interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011349
11350static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011351unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011352{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011353 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011354 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011355 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011356 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011357 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011358 void *buf1, *buf2;
11359 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011360
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011361 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011362 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011363
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011364 kind1 = PyUnicode_KIND(self);
11365 kind2 = PyUnicode_KIND(substring);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011366 if (kind1 < kind2)
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011367 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011368
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011369 len1 = PyUnicode_GET_LENGTH(self);
11370 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011371 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011372 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011373 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011374
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011375 buf1 = PyUnicode_DATA(self);
11376 buf2 = PyUnicode_DATA(substring);
11377 if (kind2 != kind1) {
11378 buf2 = _PyUnicode_AsKind(substring, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011379 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011380 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011381 }
11382 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011383 case PyUnicode_1BYTE_KIND:
11384 iresult = ucs1lib_count(
11385 ((Py_UCS1*)buf1) + start, end - start,
11386 buf2, len2, PY_SSIZE_T_MAX
11387 );
11388 break;
11389 case PyUnicode_2BYTE_KIND:
11390 iresult = ucs2lib_count(
11391 ((Py_UCS2*)buf1) + start, end - start,
11392 buf2, len2, PY_SSIZE_T_MAX
11393 );
11394 break;
11395 case PyUnicode_4BYTE_KIND:
11396 iresult = ucs4lib_count(
11397 ((Py_UCS4*)buf1) + start, end - start,
11398 buf2, len2, PY_SSIZE_T_MAX
11399 );
11400 break;
11401 default:
11402 assert(0); iresult = 0;
11403 }
11404
11405 result = PyLong_FromSsize_t(iresult);
11406
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011407 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011408 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011409
Guido van Rossumd57fd912000-03-10 22:53:23 +000011410 return result;
11411}
11412
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011413PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000011414 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011415\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000011416Encode S using the codec registered for encoding. Default encoding\n\
11417is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000011418handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000011419a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
11420'xmlcharrefreplace' as well as any other name registered with\n\
11421codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011422
11423static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011424unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011425{
Benjamin Peterson308d6372009-09-18 21:42:35 +000011426 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011427 char *encoding = NULL;
11428 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000011429
Benjamin Peterson308d6372009-09-18 21:42:35 +000011430 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
11431 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011432 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011433 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011434}
11435
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011436PyDoc_STRVAR(expandtabs__doc__,
Ezio Melotti745d54d2013-11-16 19:10:57 +020011437 "S.expandtabs(tabsize=8) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011438\n\
11439Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011440If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011441
11442static PyObject*
Ezio Melotti745d54d2013-11-16 19:10:57 +020011443unicode_expandtabs(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011444{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011445 Py_ssize_t i, j, line_pos, src_len, incr;
11446 Py_UCS4 ch;
11447 PyObject *u;
11448 void *src_data, *dest_data;
Ezio Melotti745d54d2013-11-16 19:10:57 +020011449 static char *kwlist[] = {"tabsize", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011450 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011451 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011452 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011453
Ezio Melotti745d54d2013-11-16 19:10:57 +020011454 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:expandtabs",
11455 kwlist, &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000011456 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011457
Antoine Pitrou22425222011-10-04 19:10:51 +020011458 if (PyUnicode_READY(self) == -1)
11459 return NULL;
11460
Thomas Wouters7e474022000-07-16 12:04:32 +000011461 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011462 src_len = PyUnicode_GET_LENGTH(self);
11463 i = j = line_pos = 0;
11464 kind = PyUnicode_KIND(self);
11465 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011466 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011467 for (; i < src_len; i++) {
11468 ch = PyUnicode_READ(kind, src_data, i);
11469 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011470 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011471 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011472 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011473 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011474 goto overflow;
11475 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011476 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011477 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011478 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011479 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011480 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011481 goto overflow;
11482 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011483 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011484 if (ch == '\n' || ch == '\r')
11485 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011486 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011487 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011488 if (!found)
11489 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011490
Guido van Rossumd57fd912000-03-10 22:53:23 +000011491 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011492 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011493 if (!u)
11494 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011495 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011496
Antoine Pitroue71d5742011-10-04 15:55:09 +020011497 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011498
Antoine Pitroue71d5742011-10-04 15:55:09 +020011499 for (; i < src_len; i++) {
11500 ch = PyUnicode_READ(kind, src_data, i);
11501 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011502 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011503 incr = tabsize - (line_pos % tabsize);
11504 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010011505 FILL(kind, dest_data, ' ', j, incr);
11506 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011507 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011508 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011509 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011510 line_pos++;
11511 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011512 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011513 if (ch == '\n' || ch == '\r')
11514 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011515 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011516 }
11517 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011518 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011519
Antoine Pitroue71d5742011-10-04 15:55:09 +020011520 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011521 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11522 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011523}
11524
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011525PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011526 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011527\n\
11528Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011529such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011530arguments start and end are interpreted as in slice notation.\n\
11531\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011532Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011533
11534static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011535unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011536{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011537 /* initialize variables to prevent gcc warning */
11538 PyObject *substring = NULL;
11539 Py_ssize_t start = 0;
11540 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011541 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011542
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011543 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011544 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011545
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011546 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011547 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011548
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011549 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011550
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011551 if (result == -2)
11552 return NULL;
11553
Christian Heimes217cfd12007-12-02 14:31:20 +000011554 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011555}
11556
11557static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011558unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011559{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011560 void *data;
11561 enum PyUnicode_Kind kind;
11562 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011563
11564 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
11565 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011566 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011567 }
11568 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11569 PyErr_SetString(PyExc_IndexError, "string index out of range");
11570 return NULL;
11571 }
11572 kind = PyUnicode_KIND(self);
11573 data = PyUnicode_DATA(self);
11574 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011575 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011576}
11577
Guido van Rossumc2504932007-09-18 19:42:40 +000011578/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011579 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011580static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011581unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011582{
Guido van Rossumc2504932007-09-18 19:42:40 +000011583 Py_ssize_t len;
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011584 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011585
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011586#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011587 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011588#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011589 if (_PyUnicode_HASH(self) != -1)
11590 return _PyUnicode_HASH(self);
11591 if (PyUnicode_READY(self) == -1)
11592 return -1;
11593 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011594 /*
11595 We make the hash of the empty string be 0, rather than using
11596 (prefix ^ suffix), since this slightly obfuscates the hash secret
11597 */
11598 if (len == 0) {
11599 _PyUnicode_HASH(self) = 0;
11600 return 0;
11601 }
Christian Heimes985ecdc2013-11-20 11:46:18 +010011602 x = _Py_HashBytes(PyUnicode_DATA(self),
11603 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011604 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011605 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011606}
11607
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011608PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011609 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011610\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011611Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011612
11613static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011614unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011615{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011616 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011617 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011618 PyObject *substring = NULL;
11619 Py_ssize_t start = 0;
11620 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011621
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011622 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011623 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011624
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011625 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011626 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011627
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011628 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011629
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011630 if (result == -2)
11631 return NULL;
11632
Guido van Rossumd57fd912000-03-10 22:53:23 +000011633 if (result < 0) {
11634 PyErr_SetString(PyExc_ValueError, "substring not found");
11635 return NULL;
11636 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011637
Christian Heimes217cfd12007-12-02 14:31:20 +000011638 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011639}
11640
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011641PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011642 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011643\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011644Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011645at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011646
11647static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011648unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011649{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011650 Py_ssize_t i, length;
11651 int kind;
11652 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011653 int cased;
11654
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011655 if (PyUnicode_READY(self) == -1)
11656 return NULL;
11657 length = PyUnicode_GET_LENGTH(self);
11658 kind = PyUnicode_KIND(self);
11659 data = PyUnicode_DATA(self);
11660
Guido van Rossumd57fd912000-03-10 22:53:23 +000011661 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011662 if (length == 1)
11663 return PyBool_FromLong(
11664 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011665
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011666 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011667 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011668 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011669
Guido van Rossumd57fd912000-03-10 22:53:23 +000011670 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011671 for (i = 0; i < length; i++) {
11672 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011673
Benjamin Peterson29060642009-01-31 22:14:21 +000011674 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11675 return PyBool_FromLong(0);
11676 else if (!cased && Py_UNICODE_ISLOWER(ch))
11677 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011678 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011679 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011680}
11681
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011682PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011683 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011684\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011685Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011686at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011687
11688static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011689unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011690{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011691 Py_ssize_t i, length;
11692 int kind;
11693 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011694 int cased;
11695
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011696 if (PyUnicode_READY(self) == -1)
11697 return NULL;
11698 length = PyUnicode_GET_LENGTH(self);
11699 kind = PyUnicode_KIND(self);
11700 data = PyUnicode_DATA(self);
11701
Guido van Rossumd57fd912000-03-10 22:53:23 +000011702 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011703 if (length == 1)
11704 return PyBool_FromLong(
11705 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011706
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011707 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011708 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011709 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011710
Guido van Rossumd57fd912000-03-10 22:53:23 +000011711 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011712 for (i = 0; i < length; i++) {
11713 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011714
Benjamin Peterson29060642009-01-31 22:14:21 +000011715 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11716 return PyBool_FromLong(0);
11717 else if (!cased && Py_UNICODE_ISUPPER(ch))
11718 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011719 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011720 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011721}
11722
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011723PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011724 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011725\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011726Return True if S is a titlecased string and there is at least one\n\
11727character in S, i.e. upper- and titlecase characters may only\n\
11728follow uncased characters and lowercase characters only cased ones.\n\
11729Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011730
11731static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011732unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011733{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011734 Py_ssize_t i, length;
11735 int kind;
11736 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011737 int cased, previous_is_cased;
11738
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011739 if (PyUnicode_READY(self) == -1)
11740 return NULL;
11741 length = PyUnicode_GET_LENGTH(self);
11742 kind = PyUnicode_KIND(self);
11743 data = PyUnicode_DATA(self);
11744
Guido van Rossumd57fd912000-03-10 22:53:23 +000011745 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011746 if (length == 1) {
11747 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11748 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11749 (Py_UNICODE_ISUPPER(ch) != 0));
11750 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011751
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011752 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011753 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011754 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011755
Guido van Rossumd57fd912000-03-10 22:53:23 +000011756 cased = 0;
11757 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011758 for (i = 0; i < length; i++) {
11759 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011760
Benjamin Peterson29060642009-01-31 22:14:21 +000011761 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11762 if (previous_is_cased)
11763 return PyBool_FromLong(0);
11764 previous_is_cased = 1;
11765 cased = 1;
11766 }
11767 else if (Py_UNICODE_ISLOWER(ch)) {
11768 if (!previous_is_cased)
11769 return PyBool_FromLong(0);
11770 previous_is_cased = 1;
11771 cased = 1;
11772 }
11773 else
11774 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011775 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011776 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011777}
11778
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011779PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011780 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011781\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011782Return True if all characters in S are whitespace\n\
11783and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011784
11785static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011786unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011787{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011788 Py_ssize_t i, length;
11789 int kind;
11790 void *data;
11791
11792 if (PyUnicode_READY(self) == -1)
11793 return NULL;
11794 length = PyUnicode_GET_LENGTH(self);
11795 kind = PyUnicode_KIND(self);
11796 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011797
Guido van Rossumd57fd912000-03-10 22:53:23 +000011798 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011799 if (length == 1)
11800 return PyBool_FromLong(
11801 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011802
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011803 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011804 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011805 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011806
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011807 for (i = 0; i < length; i++) {
11808 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011809 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011810 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011811 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011812 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011813}
11814
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011815PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011816 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011817\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011818Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011819and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011820
11821static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011822unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011823{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011824 Py_ssize_t i, length;
11825 int kind;
11826 void *data;
11827
11828 if (PyUnicode_READY(self) == -1)
11829 return NULL;
11830 length = PyUnicode_GET_LENGTH(self);
11831 kind = PyUnicode_KIND(self);
11832 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011833
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011834 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011835 if (length == 1)
11836 return PyBool_FromLong(
11837 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011838
11839 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011840 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011841 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011842
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011843 for (i = 0; i < length; i++) {
11844 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011845 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011846 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011847 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011848}
11849
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011850PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011851 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011852\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011853Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011854and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011855
11856static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011857unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011858{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011859 int kind;
11860 void *data;
11861 Py_ssize_t len, i;
11862
11863 if (PyUnicode_READY(self) == -1)
11864 return NULL;
11865
11866 kind = PyUnicode_KIND(self);
11867 data = PyUnicode_DATA(self);
11868 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011869
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011870 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011871 if (len == 1) {
11872 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11873 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11874 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011875
11876 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011877 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011878 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011879
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011880 for (i = 0; i < len; i++) {
11881 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011882 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011883 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011884 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011885 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011886}
11887
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011888PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011889 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011890\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011891Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011892False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011893
11894static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011895unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011896{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011897 Py_ssize_t i, length;
11898 int kind;
11899 void *data;
11900
11901 if (PyUnicode_READY(self) == -1)
11902 return NULL;
11903 length = PyUnicode_GET_LENGTH(self);
11904 kind = PyUnicode_KIND(self);
11905 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011906
Guido van Rossumd57fd912000-03-10 22:53:23 +000011907 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011908 if (length == 1)
11909 return PyBool_FromLong(
11910 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011911
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011912 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011913 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011914 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011915
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011916 for (i = 0; i < length; i++) {
11917 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011918 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011919 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011920 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011921}
11922
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011923PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011924 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011925\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011926Return True if all characters in S are digits\n\
11927and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011928
11929static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011930unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011931{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011932 Py_ssize_t i, length;
11933 int kind;
11934 void *data;
11935
11936 if (PyUnicode_READY(self) == -1)
11937 return NULL;
11938 length = PyUnicode_GET_LENGTH(self);
11939 kind = PyUnicode_KIND(self);
11940 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011941
Guido van Rossumd57fd912000-03-10 22:53:23 +000011942 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011943 if (length == 1) {
11944 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11945 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11946 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011947
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011948 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011949 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011950 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011951
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011952 for (i = 0; i < length; i++) {
11953 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011954 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011955 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011956 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011957}
11958
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011959PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011960 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011961\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011962Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011963False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011964
11965static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011966unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011967{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011968 Py_ssize_t i, length;
11969 int kind;
11970 void *data;
11971
11972 if (PyUnicode_READY(self) == -1)
11973 return NULL;
11974 length = PyUnicode_GET_LENGTH(self);
11975 kind = PyUnicode_KIND(self);
11976 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011977
Guido van Rossumd57fd912000-03-10 22:53:23 +000011978 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011979 if (length == 1)
11980 return PyBool_FromLong(
11981 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011982
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011983 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011984 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011985 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011986
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011987 for (i = 0; i < length; i++) {
11988 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011989 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011990 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011991 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011992}
11993
Martin v. Löwis47383402007-08-15 07:32:56 +000011994int
11995PyUnicode_IsIdentifier(PyObject *self)
11996{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011997 int kind;
11998 void *data;
11999 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012000 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000012001
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012002 if (PyUnicode_READY(self) == -1) {
12003 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000012004 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012005 }
12006
12007 /* Special case for empty strings */
12008 if (PyUnicode_GET_LENGTH(self) == 0)
12009 return 0;
12010 kind = PyUnicode_KIND(self);
12011 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000012012
12013 /* PEP 3131 says that the first character must be in
12014 XID_Start and subsequent characters in XID_Continue,
12015 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000012016 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000012017 letters, digits, underscore). However, given the current
12018 definition of XID_Start and XID_Continue, it is sufficient
12019 to check just for these, except that _ must be allowed
12020 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012021 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050012022 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000012023 return 0;
12024
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040012025 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012026 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000012027 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000012028 return 1;
12029}
12030
12031PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012032 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000012033\n\
12034Return True if S is a valid identifier according\n\
Raymond Hettinger378170d2013-03-23 08:21:12 -070012035to the language definition.\n\
12036\n\
12037Use keyword.iskeyword() to test for reserved identifiers\n\
12038such as \"def\" and \"class\".\n");
Martin v. Löwis47383402007-08-15 07:32:56 +000012039
12040static PyObject*
12041unicode_isidentifier(PyObject *self)
12042{
12043 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12044}
12045
Georg Brandl559e5d72008-06-11 18:37:52 +000012046PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012047 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000012048\n\
12049Return True if all characters in S are considered\n\
12050printable in repr() or S is empty, False otherwise.");
12051
12052static PyObject*
12053unicode_isprintable(PyObject *self)
12054{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012055 Py_ssize_t i, length;
12056 int kind;
12057 void *data;
12058
12059 if (PyUnicode_READY(self) == -1)
12060 return NULL;
12061 length = PyUnicode_GET_LENGTH(self);
12062 kind = PyUnicode_KIND(self);
12063 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000012064
12065 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012066 if (length == 1)
12067 return PyBool_FromLong(
12068 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000012069
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012070 for (i = 0; i < length; i++) {
12071 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012072 Py_RETURN_FALSE;
12073 }
12074 }
12075 Py_RETURN_TRUE;
12076}
12077
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012078PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000012079 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012080\n\
12081Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000012082iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012083
12084static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012085unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012086{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012087 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012088}
12089
Martin v. Löwis18e16552006-02-15 17:27:45 +000012090static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012091unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012092{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012093 if (PyUnicode_READY(self) == -1)
12094 return -1;
12095 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012096}
12097
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012098PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012099 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012100\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012101Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012102done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012103
12104static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012105unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012106{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012107 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012108 Py_UCS4 fillchar = ' ';
12109
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012110 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012111 return NULL;
12112
Benjamin Petersonbac79492012-01-14 13:34:47 -050012113 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012114 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012115
Victor Stinnerc4b49542011-12-11 22:44:26 +010012116 if (PyUnicode_GET_LENGTH(self) >= width)
12117 return unicode_result_unchanged(self);
12118
12119 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012120}
12121
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012122PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012123 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012124\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012125Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012126
12127static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012128unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012129{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012130 if (PyUnicode_READY(self) == -1)
12131 return NULL;
12132 if (PyUnicode_IS_ASCII(self))
12133 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012134 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012135}
12136
/* Strip modes understood by the strip helpers below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple() format strings, indexed by the strip modes above. */
static const char * const stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for strip mode i: the format string with its three-character
   "|O:" prefix skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
12145
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012146/* externally visible for str.strip(unicode) */
12147PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012148_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012149{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012150 void *data;
12151 int kind;
12152 Py_ssize_t i, j, len;
12153 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012154 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012155
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012156 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12157 return NULL;
12158
12159 kind = PyUnicode_KIND(self);
12160 data = PyUnicode_DATA(self);
12161 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020012162 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012163 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12164 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020012165 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012166
Benjamin Peterson14339b62009-01-31 16:36:08 +000012167 i = 0;
12168 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012169 while (i < len) {
12170 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12171 if (!BLOOM(sepmask, ch))
12172 break;
12173 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12174 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012175 i++;
12176 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012177 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012178
Benjamin Peterson14339b62009-01-31 16:36:08 +000012179 j = len;
12180 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012181 j--;
12182 while (j >= i) {
12183 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12184 if (!BLOOM(sepmask, ch))
12185 break;
12186 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12187 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012188 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012189 }
12190
Benjamin Peterson29060642009-01-31 22:14:21 +000012191 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012192 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012193
Victor Stinner7931d9a2011-11-04 00:22:48 +010012194 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012195}
12196
12197PyObject*
12198PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12199{
12200 unsigned char *data;
12201 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012202 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012203
Victor Stinnerde636f32011-10-01 03:55:54 +020012204 if (PyUnicode_READY(self) == -1)
12205 return NULL;
12206
Victor Stinner684d5fd2012-05-03 02:32:34 +020012207 length = PyUnicode_GET_LENGTH(self);
12208 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012209
Victor Stinner684d5fd2012-05-03 02:32:34 +020012210 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012211 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012212
Victor Stinnerde636f32011-10-01 03:55:54 +020012213 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012214 PyErr_SetString(PyExc_IndexError, "string index out of range");
12215 return NULL;
12216 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012217 if (start >= length || end < start)
12218 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012219
Victor Stinner684d5fd2012-05-03 02:32:34 +020012220 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012221 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012222 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012223 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012224 }
12225 else {
12226 kind = PyUnicode_KIND(self);
12227 data = PyUnicode_1BYTE_DATA(self);
12228 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012229 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012230 length);
12231 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012232}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012233
12234static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012235do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012236{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012237 Py_ssize_t len, i, j;
12238
12239 if (PyUnicode_READY(self) == -1)
12240 return NULL;
12241
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012242 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012243
Victor Stinnercc7af722013-04-09 22:39:24 +020012244 if (PyUnicode_IS_ASCII(self)) {
12245 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12246
12247 i = 0;
12248 if (striptype != RIGHTSTRIP) {
12249 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012250 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012251 if (!_Py_ascii_whitespace[ch])
12252 break;
12253 i++;
12254 }
12255 }
12256
12257 j = len;
12258 if (striptype != LEFTSTRIP) {
12259 j--;
12260 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012261 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012262 if (!_Py_ascii_whitespace[ch])
12263 break;
12264 j--;
12265 }
12266 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012267 }
12268 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012269 else {
12270 int kind = PyUnicode_KIND(self);
12271 void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012272
Victor Stinnercc7af722013-04-09 22:39:24 +020012273 i = 0;
12274 if (striptype != RIGHTSTRIP) {
12275 while (i < len) {
12276 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12277 if (!Py_UNICODE_ISSPACE(ch))
12278 break;
12279 i++;
12280 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012281 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012282
12283 j = len;
12284 if (striptype != LEFTSTRIP) {
12285 j--;
12286 while (j >= i) {
12287 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12288 if (!Py_UNICODE_ISSPACE(ch))
12289 break;
12290 j--;
12291 }
12292 j++;
12293 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012294 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012295
Victor Stinner7931d9a2011-11-04 00:22:48 +010012296 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012297}
12298
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012299
12300static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012301do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012302{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012303 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012304
Serhiy Storchakac6792272013-10-19 21:03:34 +030012305 if (!PyArg_ParseTuple(args, stripformat[striptype], &sep))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012306 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012307
Benjamin Peterson14339b62009-01-31 16:36:08 +000012308 if (sep != NULL && sep != Py_None) {
12309 if (PyUnicode_Check(sep))
12310 return _PyUnicode_XStrip(self, striptype, sep);
12311 else {
12312 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012313 "%s arg must be None or str",
12314 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012315 return NULL;
12316 }
12317 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012318
Benjamin Peterson14339b62009-01-31 16:36:08 +000012319 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012320}
12321
12322
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012323PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012324 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012325\n\
12326Return a copy of the string S with leading and trailing\n\
12327whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012328If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012329
12330static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012331unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012332{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012333 if (PyTuple_GET_SIZE(args) == 0)
12334 return do_strip(self, BOTHSTRIP); /* Common case */
12335 else
12336 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012337}
12338
12339
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012340PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012341 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012342\n\
12343Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012344If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012345
12346static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012347unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012348{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012349 if (PyTuple_GET_SIZE(args) == 0)
12350 return do_strip(self, LEFTSTRIP); /* Common case */
12351 else
12352 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012353}
12354
12355
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012356PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012357 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012358\n\
12359Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012360If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012361
12362static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012363unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012364{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012365 if (PyTuple_GET_SIZE(args) == 0)
12366 return do_strip(self, RIGHTSTRIP); /* Common case */
12367 else
12368 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012369}
12370
12371
Guido van Rossumd57fd912000-03-10 22:53:23 +000012372static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012373unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012374{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012375 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012376 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012377
Serhiy Storchaka05997252013-01-26 12:14:02 +020012378 if (len < 1)
12379 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012380
Victor Stinnerc4b49542011-12-11 22:44:26 +010012381 /* no repeat, return original string */
12382 if (len == 1)
12383 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012384
Benjamin Petersonbac79492012-01-14 13:34:47 -050012385 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012386 return NULL;
12387
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012388 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012389 PyErr_SetString(PyExc_OverflowError,
12390 "repeated string is too long");
12391 return NULL;
12392 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012393 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012394
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012395 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012396 if (!u)
12397 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012398 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012399
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012400 if (PyUnicode_GET_LENGTH(str) == 1) {
12401 const int kind = PyUnicode_KIND(str);
12402 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012403 if (kind == PyUnicode_1BYTE_KIND) {
12404 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012405 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012406 }
12407 else if (kind == PyUnicode_2BYTE_KIND) {
12408 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012409 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012410 ucs2[n] = fill_char;
12411 } else {
12412 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12413 assert(kind == PyUnicode_4BYTE_KIND);
12414 for (n = 0; n < len; ++n)
12415 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012416 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012417 }
12418 else {
12419 /* number of characters copied this far */
12420 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012421 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012422 char *to = (char *) PyUnicode_DATA(u);
12423 Py_MEMCPY(to, PyUnicode_DATA(str),
12424 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012425 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012426 n = (done <= nchars-done) ? done : nchars-done;
12427 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012428 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012429 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012430 }
12431
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012432 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012433 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012434}
12435
Alexander Belopolsky40018472011-02-26 01:02:56 +000012436PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012437PyUnicode_Replace(PyObject *str,
12438 PyObject *substr,
12439 PyObject *replstr,
Alexander Belopolsky40018472011-02-26 01:02:56 +000012440 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012441{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012442 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12443 ensure_unicode(replstr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012444 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012445 return replace(str, substr, replstr, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012446}
12447
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012448PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000012449 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012450\n\
12451Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000012452old replaced by new. If the optional argument count is\n\
12453given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012454
12455static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012456unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012457{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012458 PyObject *str1;
12459 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012460 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012461
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012462 if (!PyArg_ParseTuple(args, "UU|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012463 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060012464 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012465 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012466 return replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012467}
12468
Alexander Belopolsky40018472011-02-26 01:02:56 +000012469static PyObject *
12470unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012471{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012472 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012473 Py_ssize_t isize;
12474 Py_ssize_t osize, squote, dquote, i, o;
12475 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012476 int ikind, okind, unchanged;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012477 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012478
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012479 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012480 return NULL;
12481
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012482 isize = PyUnicode_GET_LENGTH(unicode);
12483 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012484
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012485 /* Compute length of output, quote characters, and
12486 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012487 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012488 max = 127;
12489 squote = dquote = 0;
12490 ikind = PyUnicode_KIND(unicode);
12491 for (i = 0; i < isize; i++) {
12492 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040012493 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012494 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040012495 case '\'': squote++; break;
12496 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012497 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040012498 incr = 2;
12499 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012500 default:
12501 /* Fast-path ASCII */
12502 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012503 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012504 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012505 ;
12506 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012507 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012508 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012509 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012510 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012511 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012512 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040012513 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012514 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040012515 if (osize > PY_SSIZE_T_MAX - incr) {
12516 PyErr_SetString(PyExc_OverflowError,
12517 "string is too long to generate repr");
12518 return NULL;
12519 }
12520 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012521 }
12522
12523 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012524 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012525 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012526 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012527 if (dquote)
12528 /* Both squote and dquote present. Use squote,
12529 and escape them */
12530 osize += squote;
12531 else
12532 quote = '"';
12533 }
Victor Stinner55c08782013-04-14 18:45:39 +020012534 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012535
12536 repr = PyUnicode_New(osize, max);
12537 if (repr == NULL)
12538 return NULL;
12539 okind = PyUnicode_KIND(repr);
12540 odata = PyUnicode_DATA(repr);
12541
12542 PyUnicode_WRITE(okind, odata, 0, quote);
12543 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012544 if (unchanged) {
12545 _PyUnicode_FastCopyCharacters(repr, 1,
12546 unicode, 0,
12547 isize);
12548 }
12549 else {
12550 for (i = 0, o = 1; i < isize; i++) {
12551 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012552
Victor Stinner55c08782013-04-14 18:45:39 +020012553 /* Escape quotes and backslashes */
12554 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012555 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012556 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012557 continue;
12558 }
12559
12560 /* Map special whitespace to '\t', \n', '\r' */
12561 if (ch == '\t') {
12562 PyUnicode_WRITE(okind, odata, o++, '\\');
12563 PyUnicode_WRITE(okind, odata, o++, 't');
12564 }
12565 else if (ch == '\n') {
12566 PyUnicode_WRITE(okind, odata, o++, '\\');
12567 PyUnicode_WRITE(okind, odata, o++, 'n');
12568 }
12569 else if (ch == '\r') {
12570 PyUnicode_WRITE(okind, odata, o++, '\\');
12571 PyUnicode_WRITE(okind, odata, o++, 'r');
12572 }
12573
12574 /* Map non-printable US ASCII to '\xhh' */
12575 else if (ch < ' ' || ch == 0x7F) {
12576 PyUnicode_WRITE(okind, odata, o++, '\\');
12577 PyUnicode_WRITE(okind, odata, o++, 'x');
12578 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12579 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12580 }
12581
12582 /* Copy ASCII characters as-is */
12583 else if (ch < 0x7F) {
12584 PyUnicode_WRITE(okind, odata, o++, ch);
12585 }
12586
12587 /* Non-ASCII characters */
12588 else {
12589 /* Map Unicode whitespace and control characters
12590 (categories Z* and C* except ASCII space)
12591 */
12592 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12593 PyUnicode_WRITE(okind, odata, o++, '\\');
12594 /* Map 8-bit characters to '\xhh' */
12595 if (ch <= 0xff) {
12596 PyUnicode_WRITE(okind, odata, o++, 'x');
12597 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12598 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12599 }
12600 /* Map 16-bit characters to '\uxxxx' */
12601 else if (ch <= 0xffff) {
12602 PyUnicode_WRITE(okind, odata, o++, 'u');
12603 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12604 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12605 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12606 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12607 }
12608 /* Map 21-bit characters to '\U00xxxxxx' */
12609 else {
12610 PyUnicode_WRITE(okind, odata, o++, 'U');
12611 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12612 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12613 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12614 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12615 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12616 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12617 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12618 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12619 }
12620 }
12621 /* Copy characters as-is */
12622 else {
12623 PyUnicode_WRITE(okind, odata, o++, ch);
12624 }
Georg Brandl559e5d72008-06-11 18:37:52 +000012625 }
12626 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012627 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012628 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012629 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012630 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012631}
12632
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012633PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012634 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012635\n\
12636Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012637such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012638arguments start and end are interpreted as in slice notation.\n\
12639\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012640Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012641
12642static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012643unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012644{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012645 /* initialize variables to prevent gcc warning */
12646 PyObject *substring = NULL;
12647 Py_ssize_t start = 0;
12648 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012649 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012650
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012651 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012652 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012653
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012654 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012655 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012656
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012657 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012658
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012659 if (result == -2)
12660 return NULL;
12661
Christian Heimes217cfd12007-12-02 14:31:20 +000012662 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012663}
12664
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012665PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012666 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012667\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012668Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012669
12670static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012671unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012672{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012673 /* initialize variables to prevent gcc warning */
12674 PyObject *substring = NULL;
12675 Py_ssize_t start = 0;
12676 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012677 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012678
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012679 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012680 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012681
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012682 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012683 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012684
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012685 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012686
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012687 if (result == -2)
12688 return NULL;
12689
Guido van Rossumd57fd912000-03-10 22:53:23 +000012690 if (result < 0) {
12691 PyErr_SetString(PyExc_ValueError, "substring not found");
12692 return NULL;
12693 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012694
Christian Heimes217cfd12007-12-02 14:31:20 +000012695 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012696}
12697
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012698PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012699 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012700\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012701Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012702done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012703
12704static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012705unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012706{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012707 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012708 Py_UCS4 fillchar = ' ';
12709
Victor Stinnere9a29352011-10-01 02:14:59 +020012710 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012711 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012712
Benjamin Petersonbac79492012-01-14 13:34:47 -050012713 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012714 return NULL;
12715
Victor Stinnerc4b49542011-12-11 22:44:26 +010012716 if (PyUnicode_GET_LENGTH(self) >= width)
12717 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012718
Victor Stinnerc4b49542011-12-11 22:44:26 +010012719 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012720}
12721
Alexander Belopolsky40018472011-02-26 01:02:56 +000012722PyObject *
12723PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012724{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012725 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012726 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012727
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012728 return split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012729}
12730
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012731PyDoc_STRVAR(split__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012732 "S.split(sep=None, maxsplit=-1) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012733\n\
12734Return a list of the words in S, using sep as the\n\
12735delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012736splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012737whitespace string is a separator and empty strings are\n\
12738removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012739
12740static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012741unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012742{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012743 static char *kwlist[] = {"sep", "maxsplit", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000012744 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012745 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012746
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012747 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12748 kwlist, &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012749 return NULL;
12750
12751 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012752 return split(self, NULL, maxcount);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012753
12754 if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012755 return split(self, substring, maxcount);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012756
12757 PyErr_Format(PyExc_TypeError,
12758 "must be str or None, not %.100s",
12759 Py_TYPE(substring)->tp_name);
12760 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012761}
12762
Thomas Wouters477c8d52006-05-27 19:21:47 +000012763PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012764PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012765{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012766 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012767 int kind1, kind2;
12768 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012769 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012770
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012771 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012772 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012773
Victor Stinner14f8f022011-10-05 20:58:25 +020012774 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012775 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012776 len1 = PyUnicode_GET_LENGTH(str_obj);
12777 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012778 if (kind1 < kind2 || len1 < len2) {
12779 _Py_INCREF_UNICODE_EMPTY();
12780 if (!unicode_empty)
12781 out = NULL;
12782 else {
12783 out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
12784 Py_DECREF(unicode_empty);
12785 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012786 return out;
12787 }
12788 buf1 = PyUnicode_DATA(str_obj);
12789 buf2 = PyUnicode_DATA(sep_obj);
12790 if (kind2 != kind1) {
12791 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12792 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012793 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012794 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012795
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012796 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012797 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012798 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12799 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12800 else
12801 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012802 break;
12803 case PyUnicode_2BYTE_KIND:
12804 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12805 break;
12806 case PyUnicode_4BYTE_KIND:
12807 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12808 break;
12809 default:
12810 assert(0);
12811 out = 0;
12812 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012813
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012814 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012815 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012816
12817 return out;
12818}
12819
12820
12821PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012822PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012823{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012824 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012825 int kind1, kind2;
12826 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012827 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012828
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012829 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012830 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012831
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012832 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012833 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012834 len1 = PyUnicode_GET_LENGTH(str_obj);
12835 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012836 if (kind1 < kind2 || len1 < len2) {
12837 _Py_INCREF_UNICODE_EMPTY();
12838 if (!unicode_empty)
12839 out = NULL;
12840 else {
12841 out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
12842 Py_DECREF(unicode_empty);
12843 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012844 return out;
12845 }
12846 buf1 = PyUnicode_DATA(str_obj);
12847 buf2 = PyUnicode_DATA(sep_obj);
12848 if (kind2 != kind1) {
12849 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12850 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012851 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012852 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012853
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012854 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012855 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012856 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12857 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12858 else
12859 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012860 break;
12861 case PyUnicode_2BYTE_KIND:
12862 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12863 break;
12864 case PyUnicode_4BYTE_KIND:
12865 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12866 break;
12867 default:
12868 assert(0);
12869 out = 0;
12870 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012871
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012872 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012873 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012874
12875 return out;
12876}
12877
12878PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012879 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012880\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012881Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012882the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012883found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012884
12885static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012886unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012887{
Victor Stinner9310abb2011-10-05 00:59:23 +020012888 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012889}
12890
12891PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012892 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012893\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012894Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012895the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012896separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012897
12898static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012899unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012900{
Victor Stinner9310abb2011-10-05 00:59:23 +020012901 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012902}
12903
Alexander Belopolsky40018472011-02-26 01:02:56 +000012904PyObject *
12905PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012906{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012907 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012908 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012909
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012910 return rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012911}
12912
12913PyDoc_STRVAR(rsplit__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012914 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012915\n\
12916Return a list of the words in S, using sep as the\n\
12917delimiter string, starting at the end of the string and\n\
12918working to the front. If maxsplit is given, at most maxsplit\n\
12919splits are done. If sep is not specified, any whitespace string\n\
12920is a separator.");
12921
12922static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012923unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012924{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012925 static char *kwlist[] = {"sep", "maxsplit", 0};
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012926 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012927 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012928
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012929 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
12930 kwlist, &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012931 return NULL;
12932
12933 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012934 return rsplit(self, NULL, maxcount);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012935
12936 if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012937 return rsplit(self, substring, maxcount);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012938
12939 PyErr_Format(PyExc_TypeError,
12940 "must be str or None, not %.100s",
12941 Py_TYPE(substring)->tp_name);
12942 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012943}
12944
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012945PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012946 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012947\n\
12948Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012949Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012950is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012951
12952static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012953unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012954{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012955 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012956 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012957
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012958 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12959 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012960 return NULL;
12961
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012962 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012963}
12964
12965static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012966PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012967{
Victor Stinnerc4b49542011-12-11 22:44:26 +010012968 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012969}
12970
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012971PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012972 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012973\n\
12974Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012975and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012976
12977static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012978unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012979{
Benjamin Petersoneea48462012-01-16 14:28:50 -050012980 if (PyUnicode_READY(self) == -1)
12981 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012982 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012983}
12984
Larry Hastings61272b72014-01-07 12:41:53 -080012985/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000012986
Larry Hastings31826802013-10-19 00:09:25 -070012987@staticmethod
12988str.maketrans as unicode_maketrans
12989
12990 x: object
12991
12992 y: unicode=NULL
12993
12994 z: unicode=NULL
12995
12996 /
12997
12998Return a translation table usable for str.translate().
12999
13000If there is only one argument, it must be a dictionary mapping Unicode
13001ordinals (integers) or characters to Unicode ordinals, strings or None.
13002Character keys will be then converted to ordinals.
13003If there are two arguments, they must be strings of equal length, and
13004in the resulting dictionary, each character in x will be mapped to the
13005character at the same position in y. If there is a third argument, it
13006must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080013007[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070013008
Larry Hastings31826802013-10-19 00:09:25 -070013009static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080013010unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030013011/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070013012{
Georg Brandlceee0772007-11-27 23:48:05 +000013013 PyObject *new = NULL, *key, *value;
13014 Py_ssize_t i = 0;
13015 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013016
Georg Brandlceee0772007-11-27 23:48:05 +000013017 new = PyDict_New();
13018 if (!new)
13019 return NULL;
13020 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013021 int x_kind, y_kind, z_kind;
13022 void *x_data, *y_data, *z_data;
13023
Georg Brandlceee0772007-11-27 23:48:05 +000013024 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000013025 if (!PyUnicode_Check(x)) {
13026 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13027 "be a string if there is a second argument");
13028 goto err;
13029 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013030 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013031 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13032 "arguments must have equal length");
13033 goto err;
13034 }
13035 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013036 x_kind = PyUnicode_KIND(x);
13037 y_kind = PyUnicode_KIND(y);
13038 x_data = PyUnicode_DATA(x);
13039 y_data = PyUnicode_DATA(y);
13040 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13041 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013042 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000013043 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060013044 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013045 if (!value) {
13046 Py_DECREF(key);
13047 goto err;
13048 }
Georg Brandlceee0772007-11-27 23:48:05 +000013049 res = PyDict_SetItem(new, key, value);
13050 Py_DECREF(key);
13051 Py_DECREF(value);
13052 if (res < 0)
13053 goto err;
13054 }
13055 /* create entries for deleting chars in z */
13056 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013057 z_kind = PyUnicode_KIND(z);
13058 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013059 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013060 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013061 if (!key)
13062 goto err;
13063 res = PyDict_SetItem(new, key, Py_None);
13064 Py_DECREF(key);
13065 if (res < 0)
13066 goto err;
13067 }
13068 }
13069 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013070 int kind;
13071 void *data;
13072
Georg Brandlceee0772007-11-27 23:48:05 +000013073 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013074 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013075 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13076 "to maketrans it must be a dict");
13077 goto err;
13078 }
13079 /* copy entries into the new dict, converting string keys to int keys */
13080 while (PyDict_Next(x, &i, &key, &value)) {
13081 if (PyUnicode_Check(key)) {
13082 /* convert string keys to integer keys */
13083 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013084 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013085 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13086 "table must be of length 1");
13087 goto err;
13088 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013089 kind = PyUnicode_KIND(key);
13090 data = PyUnicode_DATA(key);
13091 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013092 if (!newkey)
13093 goto err;
13094 res = PyDict_SetItem(new, newkey, value);
13095 Py_DECREF(newkey);
13096 if (res < 0)
13097 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013098 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013099 /* just keep integer keys */
13100 if (PyDict_SetItem(new, key, value) < 0)
13101 goto err;
13102 } else {
13103 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13104 "be strings or integers");
13105 goto err;
13106 }
13107 }
13108 }
13109 return new;
13110 err:
13111 Py_DECREF(new);
13112 return NULL;
13113}
13114
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013115PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013116 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013117\n\
Zachary Ware79b98df2015-08-05 23:54:15 -050013118Return a copy of the string S in which each character has been mapped\n\
13119through the given translation table. The table must implement\n\
13120lookup/indexing via __getitem__, for instance a dictionary or list,\n\
13121mapping Unicode ordinals to Unicode ordinals, strings, or None. If\n\
13122this operation raises LookupError, the character is left untouched.\n\
13123Characters mapped to None are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013124
13125static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013126unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013127{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013128 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013129}
13130
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013131PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013132 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013133\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013134Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013135
13136static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020013137unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013138{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013139 if (PyUnicode_READY(self) == -1)
13140 return NULL;
13141 if (PyUnicode_IS_ASCII(self))
13142 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013143 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013144}
13145
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013146PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013147 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013148\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000013149Pad a numeric string S with zeros on the left, to fill a field\n\
13150of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013151
13152static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020013153unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013154{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013155 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013156 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013157 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013158 int kind;
13159 void *data;
13160 Py_UCS4 chr;
13161
Martin v. Löwis18e16552006-02-15 17:27:45 +000013162 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000013163 return NULL;
13164
Benjamin Petersonbac79492012-01-14 13:34:47 -050013165 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013166 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013167
Victor Stinnerc4b49542011-12-11 22:44:26 +010013168 if (PyUnicode_GET_LENGTH(self) >= width)
13169 return unicode_result_unchanged(self);
13170
13171 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013172
13173 u = pad(self, fill, 0, '0');
13174
Walter Dörwald068325e2002-04-15 13:36:47 +000013175 if (u == NULL)
13176 return NULL;
13177
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013178 kind = PyUnicode_KIND(u);
13179 data = PyUnicode_DATA(u);
13180 chr = PyUnicode_READ(kind, data, fill);
13181
13182 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013183 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013184 PyUnicode_WRITE(kind, data, 0, chr);
13185 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013186 }
13187
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013188 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013189 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013190}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013191
#if 0
/* Currently compiled out.  Thin wrapper that returns the result of
   PyUnicode_TransformDecimalAndSpaceToASCII(self). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
13199
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013200PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013201 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013202\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013203Return True if S starts with the specified prefix, False otherwise.\n\
13204With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013205With optional end, stop comparing S at that position.\n\
13206prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013207
13208static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013209unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013210 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013211{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013212 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013213 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013214 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013215 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013216 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013217
Jesus Ceaac451502011-04-20 17:09:23 +020013218 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013219 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013220 if (PyTuple_Check(subobj)) {
13221 Py_ssize_t i;
13222 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013223 substring = PyTuple_GET_ITEM(subobj, i);
13224 if (!PyUnicode_Check(substring)) {
13225 PyErr_Format(PyExc_TypeError,
13226 "tuple for startswith must only contain str, "
13227 "not %.100s",
13228 Py_TYPE(substring)->tp_name);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013229 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013230 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013231 result = tailmatch(self, substring, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013232 if (result == -1)
13233 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013234 if (result) {
13235 Py_RETURN_TRUE;
13236 }
13237 }
13238 /* nothing matched */
13239 Py_RETURN_FALSE;
13240 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013241 if (!PyUnicode_Check(subobj)) {
13242 PyErr_Format(PyExc_TypeError,
13243 "startswith first arg must be str or "
13244 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013245 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013246 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013247 result = tailmatch(self, subobj, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013248 if (result == -1)
13249 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013250 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013251}
13252
13253
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013254PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013255 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013256\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013257Return True if S ends with the specified suffix, False otherwise.\n\
13258With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013259With optional end, stop comparing S at that position.\n\
13260suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013261
13262static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013263unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013264 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013265{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013266 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013267 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013268 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013269 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013270 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013271
Jesus Ceaac451502011-04-20 17:09:23 +020013272 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013273 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013274 if (PyTuple_Check(subobj)) {
13275 Py_ssize_t i;
13276 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013277 substring = PyTuple_GET_ITEM(subobj, i);
13278 if (!PyUnicode_Check(substring)) {
13279 PyErr_Format(PyExc_TypeError,
13280 "tuple for endswith must only contain str, "
13281 "not %.100s",
13282 Py_TYPE(substring)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013283 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013284 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013285 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013286 if (result == -1)
13287 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013288 if (result) {
13289 Py_RETURN_TRUE;
13290 }
13291 }
13292 Py_RETURN_FALSE;
13293 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013294 if (!PyUnicode_Check(subobj)) {
13295 PyErr_Format(PyExc_TypeError,
13296 "endswith first arg must be str or "
13297 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013298 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013299 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013300 result = tailmatch(self, subobj, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013301 if (result == -1)
13302 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013303 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013304}
13305
Victor Stinner202fdca2012-05-07 12:47:02 +020013306Py_LOCAL_INLINE(void)
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013307_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013308{
Victor Stinnereb36fda2015-10-03 01:55:51 +020013309 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13310 writer->data = PyUnicode_DATA(writer->buffer);
13311
13312 if (!writer->readonly) {
13313 writer->kind = PyUnicode_KIND(writer->buffer);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013314 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinnereb36fda2015-10-03 01:55:51 +020013315 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013316 else {
Victor Stinnereb36fda2015-10-03 01:55:51 +020013317 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13318 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13319 writer->kind = PyUnicode_WCHAR_KIND;
13320 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13321
Victor Stinner8f674cc2013-04-17 23:02:17 +020013322 /* Copy-on-write mode: set buffer size to 0 so
13323 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13324 * next write. */
13325 writer->size = 0;
13326 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013327}
13328
Victor Stinnerd3f08822012-05-29 12:57:52 +020013329void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013330_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013331{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013332 memset(writer, 0, sizeof(*writer));
Victor Stinnereb36fda2015-10-03 01:55:51 +020013333
13334 /* ASCII is the bare minimum */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013335 writer->min_char = 127;
Victor Stinnereb36fda2015-10-03 01:55:51 +020013336
13337 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13338 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13339 writer->kind = PyUnicode_WCHAR_KIND;
13340 assert(writer->kind <= PyUnicode_1BYTE_KIND);
Victor Stinner202fdca2012-05-07 12:47:02 +020013341}
13342
Victor Stinnerd3f08822012-05-29 12:57:52 +020013343int
13344_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13345 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013346{
13347 Py_ssize_t newlen;
13348 PyObject *newbuffer;
13349
Victor Stinnerca9381e2015-09-22 00:58:32 +020013350 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
Victor Stinner61744742015-09-22 01:01:17 +020013351 assert((maxchar > writer->maxchar && length >= 0)
13352 || length > 0);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013353
Victor Stinner202fdca2012-05-07 12:47:02 +020013354 if (length > PY_SSIZE_T_MAX - writer->pos) {
13355 PyErr_NoMemory();
13356 return -1;
13357 }
13358 newlen = writer->pos + length;
13359
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013360 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013361
Victor Stinnerd3f08822012-05-29 12:57:52 +020013362 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013363 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013364 if (writer->overallocate
13365 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13366 /* overallocate to limit the number of realloc() */
13367 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013368 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013369 if (newlen < writer->min_length)
13370 newlen = writer->min_length;
13371
Victor Stinnerd3f08822012-05-29 12:57:52 +020013372 writer->buffer = PyUnicode_New(newlen, maxchar);
13373 if (writer->buffer == NULL)
13374 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013375 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013376 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013377 if (writer->overallocate
13378 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13379 /* overallocate to limit the number of realloc() */
13380 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013381 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013382 if (newlen < writer->min_length)
13383 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013384
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013385 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013386 /* resize + widen */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +030013387 maxchar = Py_MAX(maxchar, writer->maxchar);
Victor Stinner202fdca2012-05-07 12:47:02 +020013388 newbuffer = PyUnicode_New(newlen, maxchar);
13389 if (newbuffer == NULL)
13390 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013391 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13392 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013393 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013394 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013395 }
13396 else {
13397 newbuffer = resize_compact(writer->buffer, newlen);
13398 if (newbuffer == NULL)
13399 return -1;
13400 }
13401 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013402 }
13403 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013404 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013405 newbuffer = PyUnicode_New(writer->size, maxchar);
13406 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013407 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013408 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13409 writer->buffer, 0, writer->pos);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030013410 Py_SETREF(writer->buffer, newbuffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013411 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013412 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013413 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010013414
13415#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020013416}
13417
Victor Stinnerca9381e2015-09-22 00:58:32 +020013418int
13419_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13420 enum PyUnicode_Kind kind)
13421{
13422 Py_UCS4 maxchar;
13423
13424 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13425 assert(writer->kind < kind);
13426
13427 switch (kind)
13428 {
13429 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13430 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13431 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
13432 default:
13433 assert(0 && "invalid kind");
13434 return -1;
13435 }
13436
13437 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13438}
13439
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013440Py_LOCAL_INLINE(int)
13441_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013442{
13443 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13444 return -1;
13445 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13446 writer->pos++;
13447 return 0;
13448}
13449
13450int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013451_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13452{
13453 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13454}
13455
13456int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013457_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13458{
13459 Py_UCS4 maxchar;
13460 Py_ssize_t len;
13461
13462 if (PyUnicode_READY(str) == -1)
13463 return -1;
13464 len = PyUnicode_GET_LENGTH(str);
13465 if (len == 0)
13466 return 0;
13467 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13468 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013469 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010013470 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020013471 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013472 Py_INCREF(str);
13473 writer->buffer = str;
13474 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013475 writer->pos += len;
13476 return 0;
13477 }
13478 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13479 return -1;
13480 }
13481 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13482 str, 0, len);
13483 writer->pos += len;
13484 return 0;
13485}
13486
Victor Stinnere215d962012-10-06 23:03:36 +020013487int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013488_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13489 Py_ssize_t start, Py_ssize_t end)
13490{
13491 Py_UCS4 maxchar;
13492 Py_ssize_t len;
13493
13494 if (PyUnicode_READY(str) == -1)
13495 return -1;
13496
13497 assert(0 <= start);
13498 assert(end <= PyUnicode_GET_LENGTH(str));
13499 assert(start <= end);
13500
13501 if (end == 0)
13502 return 0;
13503
13504 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13505 return _PyUnicodeWriter_WriteStr(writer, str);
13506
13507 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13508 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13509 else
13510 maxchar = writer->maxchar;
13511 len = end - start;
13512
13513 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13514 return -1;
13515
13516 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13517 str, start, len);
13518 writer->pos += len;
13519 return 0;
13520}
13521
13522int
Victor Stinner4a587072013-11-19 12:54:53 +010013523_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13524 const char *ascii, Py_ssize_t len)
13525{
13526 if (len == -1)
13527 len = strlen(ascii);
13528
13529 assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13530
13531 if (writer->buffer == NULL && !writer->overallocate) {
13532 PyObject *str;
13533
13534 str = _PyUnicode_FromASCII(ascii, len);
13535 if (str == NULL)
13536 return -1;
13537
13538 writer->readonly = 1;
13539 writer->buffer = str;
13540 _PyUnicodeWriter_Update(writer);
13541 writer->pos += len;
13542 return 0;
13543 }
13544
13545 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13546 return -1;
13547
13548 switch (writer->kind)
13549 {
13550 case PyUnicode_1BYTE_KIND:
13551 {
13552 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13553 Py_UCS1 *data = writer->data;
13554
13555 Py_MEMCPY(data + writer->pos, str, len);
13556 break;
13557 }
13558 case PyUnicode_2BYTE_KIND:
13559 {
13560 _PyUnicode_CONVERT_BYTES(
13561 Py_UCS1, Py_UCS2,
13562 ascii, ascii + len,
13563 (Py_UCS2 *)writer->data + writer->pos);
13564 break;
13565 }
13566 case PyUnicode_4BYTE_KIND:
13567 {
13568 _PyUnicode_CONVERT_BYTES(
13569 Py_UCS1, Py_UCS4,
13570 ascii, ascii + len,
13571 (Py_UCS4 *)writer->data + writer->pos);
13572 break;
13573 }
13574 default:
13575 assert(0);
13576 }
13577
13578 writer->pos += len;
13579 return 0;
13580}
13581
13582int
13583_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13584 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020013585{
13586 Py_UCS4 maxchar;
13587
13588 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13589 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13590 return -1;
13591 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13592 writer->pos += len;
13593 return 0;
13594}
13595
Victor Stinnerd3f08822012-05-29 12:57:52 +020013596PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013597_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013598{
Victor Stinner15a0bd32013-07-08 22:29:55 +020013599 PyObject *str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013600 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013601 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013602 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013603 }
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013604 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013605 str = writer->buffer;
13606 writer->buffer = NULL;
13607 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13608 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013609 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013610 if (writer->pos == 0) {
13611 Py_CLEAR(writer->buffer);
13612
13613 /* Get the empty Unicode string singleton ('') */
13614 _Py_INCREF_UNICODE_EMPTY();
13615 str = unicode_empty;
Victor Stinner202fdca2012-05-07 12:47:02 +020013616 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013617 else {
13618 str = writer->buffer;
13619 writer->buffer = NULL;
13620
13621 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
13622 PyObject *str2;
13623 str2 = resize_compact(str, writer->pos);
13624 if (str2 == NULL)
13625 return NULL;
13626 str = str2;
13627 }
13628 }
13629
Victor Stinner15a0bd32013-07-08 22:29:55 +020013630 assert(_PyUnicode_CheckConsistency(str, 1));
13631 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020013632}
13633
Victor Stinnerd3f08822012-05-29 12:57:52 +020013634void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013635_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013636{
13637 Py_CLEAR(writer->buffer);
13638}
13639
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013640#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013641
13642PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013643 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013644\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013645Return a formatted version of S, using substitutions from args and kwargs.\n\
13646The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013647
Eric Smith27bbca62010-11-04 17:06:58 +000013648PyDoc_STRVAR(format_map__doc__,
13649 "S.format_map(mapping) -> str\n\
13650\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013651Return a formatted version of S, using substitutions from mapping.\n\
13652The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013653
Eric Smith4a7d76d2008-05-30 18:10:19 +000013654static PyObject *
13655unicode__format__(PyObject* self, PyObject* args)
13656{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013657 PyObject *format_spec;
13658 _PyUnicodeWriter writer;
13659 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013660
13661 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
13662 return NULL;
13663
Victor Stinnerd3f08822012-05-29 12:57:52 +020013664 if (PyUnicode_READY(self) == -1)
13665 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020013666 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013667 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13668 self, format_spec, 0,
13669 PyUnicode_GET_LENGTH(format_spec));
13670 if (ret == -1) {
13671 _PyUnicodeWriter_Dealloc(&writer);
13672 return NULL;
13673 }
13674 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013675}
13676
Eric Smith8c663262007-08-25 02:26:07 +000013677PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013678 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013679\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013680Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000013681
13682static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013683unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013684{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013685 Py_ssize_t size;
13686
13687 /* If it's a compact object, account for base structure +
13688 character data. */
13689 if (PyUnicode_IS_COMPACT_ASCII(v))
13690 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
13691 else if (PyUnicode_IS_COMPACT(v))
13692 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013693 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013694 else {
13695 /* If it is a two-block object, account for base object, and
13696 for character block if present. */
13697 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020013698 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013699 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013700 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013701 }
13702 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013703 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020013704 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013705 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020013706 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020013707 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013708
13709 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013710}
13711
13712PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013713 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013714
13715static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013716unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013717{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013718 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013719 if (!copy)
13720 return NULL;
13721 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013722}
13723
Guido van Rossumd57fd912000-03-10 22:53:23 +000013724static PyMethodDef unicode_methods[] = {
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000013725 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013726 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
Ezio Melotticda6b6d2012-02-26 09:39:55 +020013727 {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
13728 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013729 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
13730 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
Benjamin Petersond5890c82012-01-14 13:23:30 -050013731 {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013732 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
13733 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
13734 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
Ezio Melotti745d54d2013-11-16 19:10:57 +020013735 {"expandtabs", (PyCFunction) unicode_expandtabs,
13736 METH_VARARGS | METH_KEYWORDS, expandtabs__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013737 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013738 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013739 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
13740 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
13741 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013742 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013743 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13744 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
13745 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013746 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013747 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Ezio Melotti745d54d2013-11-16 19:10:57 +020013748 {"splitlines", (PyCFunction) unicode_splitlines,
13749 METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013750 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013751 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
13752 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
13753 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
13754 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13755 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
13756 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
13757 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
13758 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
13759 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
13760 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
13761 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
13762 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
13763 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
13764 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000013765 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000013766 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013767 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000013768 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013769 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000013770 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Larry Hastings31826802013-10-19 00:09:25 -070013771 UNICODE_MAKETRANS_METHODDEF
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013772 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000013773#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013774 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013775 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013776#endif
13777
Benjamin Peterson14339b62009-01-31 16:36:08 +000013778 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013779 {NULL, NULL}
13780};
13781
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013782static PyObject *
13783unicode_mod(PyObject *v, PyObject *w)
13784{
Brian Curtindfc80e32011-08-10 20:28:54 -050013785 if (!PyUnicode_Check(v))
13786 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013787 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013788}
13789
13790static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013791 0, /*nb_add*/
13792 0, /*nb_subtract*/
13793 0, /*nb_multiply*/
13794 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013795};
13796
Guido van Rossumd57fd912000-03-10 22:53:23 +000013797static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013798 (lenfunc) unicode_length, /* sq_length */
13799 PyUnicode_Concat, /* sq_concat */
13800 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13801 (ssizeargfunc) unicode_getitem, /* sq_item */
13802 0, /* sq_slice */
13803 0, /* sq_ass_item */
13804 0, /* sq_ass_slice */
13805 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013806};
13807
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013808static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013809unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013810{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013811 if (PyUnicode_READY(self) == -1)
13812 return NULL;
13813
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013814 if (PyIndex_Check(item)) {
13815 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013816 if (i == -1 && PyErr_Occurred())
13817 return NULL;
13818 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013819 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013820 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013821 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013822 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013823 PyObject *result;
13824 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013825 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013826 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013827
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013828 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013829 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013830 return NULL;
13831 }
13832
13833 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020013834 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013835 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013836 slicelength == PyUnicode_GET_LENGTH(self)) {
13837 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013838 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013839 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013840 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013841 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013842 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013843 src_kind = PyUnicode_KIND(self);
13844 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013845 if (!PyUnicode_IS_ASCII(self)) {
13846 kind_limit = kind_maxchar_limit(src_kind);
13847 max_char = 0;
13848 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13849 ch = PyUnicode_READ(src_kind, src_data, cur);
13850 if (ch > max_char) {
13851 max_char = ch;
13852 if (max_char >= kind_limit)
13853 break;
13854 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013855 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013856 }
Victor Stinner55c99112011-10-13 01:17:06 +020013857 else
13858 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013859 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013860 if (result == NULL)
13861 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013862 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013863 dest_data = PyUnicode_DATA(result);
13864
13865 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013866 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13867 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013868 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013869 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013870 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013871 } else {
13872 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13873 return NULL;
13874 }
13875}
13876
13877static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013878 (lenfunc)unicode_length, /* mp_length */
13879 (binaryfunc)unicode_subscript, /* mp_subscript */
13880 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013881};
13882
Guido van Rossumd57fd912000-03-10 22:53:23 +000013883
Guido van Rossumd57fd912000-03-10 22:53:23 +000013884/* Helpers for PyUnicode_Format() */
13885
Victor Stinnera47082312012-10-04 02:19:54 +020013886struct unicode_formatter_t {
13887 PyObject *args;
13888 int args_owned;
13889 Py_ssize_t arglen, argidx;
13890 PyObject *dict;
13891
13892 enum PyUnicode_Kind fmtkind;
13893 Py_ssize_t fmtcnt, fmtpos;
13894 void *fmtdata;
13895 PyObject *fmtstr;
13896
13897 _PyUnicodeWriter writer;
13898};
13899
13900struct unicode_format_arg_t {
13901 Py_UCS4 ch;
13902 int flags;
13903 Py_ssize_t width;
13904 int prec;
13905 int sign;
13906};
13907
Guido van Rossumd57fd912000-03-10 22:53:23 +000013908static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020013909unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013910{
Victor Stinnera47082312012-10-04 02:19:54 +020013911 Py_ssize_t argidx = ctx->argidx;
13912
13913 if (argidx < ctx->arglen) {
13914 ctx->argidx++;
13915 if (ctx->arglen < 0)
13916 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000013917 else
Victor Stinnera47082312012-10-04 02:19:54 +020013918 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013919 }
13920 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013921 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013922 return NULL;
13923}
13924
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013925/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013926
Victor Stinnera47082312012-10-04 02:19:54 +020013927/* Format a float into the writer if the writer is not NULL, or into *p_output
13928 otherwise.
13929
13930 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020013931static int
Victor Stinnera47082312012-10-04 02:19:54 +020013932formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
13933 PyObject **p_output,
13934 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013935{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013936 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013937 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013938 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020013939 int prec;
13940 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000013941
Guido van Rossumd57fd912000-03-10 22:53:23 +000013942 x = PyFloat_AsDouble(v);
13943 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020013944 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013945
Victor Stinnera47082312012-10-04 02:19:54 +020013946 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013947 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013948 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013949
Victor Stinnera47082312012-10-04 02:19:54 +020013950 if (arg->flags & F_ALT)
13951 dtoa_flags = Py_DTSF_ALT;
13952 else
13953 dtoa_flags = 0;
13954 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013955 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020013956 return -1;
13957 len = strlen(p);
13958 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010013959 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020013960 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013961 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020013962 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020013963 }
13964 else
13965 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000013966 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013967 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013968}
13969
Victor Stinnerd0880d52012-04-27 23:40:13 +020013970/* formatlong() emulates the format codes d, u, o, x and X, and
13971 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
13972 * Python's regular ints.
13973 * Return value: a new PyUnicodeObject*, or NULL if error.
13974 * The output string is of the form
13975 * "-"? ("0x" | "0X")? digit+
13976 * "0x"/"0X" are present only for x and X conversions, with F_ALT
13977 * set in flags. The case of hex digits will be correct,
13978 * There will be at least prec digits, zero-filled on the left if
13979 * necessary to get that many.
13980 * val object to be converted
13981 * flags bitmask of format flags; only F_ALT is looked at
13982 * prec minimum number of digits; 0-fill on left if needed
13983 * type a character in [duoxX]; u acts the same as d
13984 *
13985 * CAUTION: o, x and X conversions on regular ints can never
13986 * produce a '-' sign, but can for Python's unbounded ints.
13987 */
Ethan Furmanb95b5612015-01-23 20:05:18 -080013988PyObject *
13989_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
Tim Peters38fd5b62000-09-21 05:43:11 +000013990{
Victor Stinnerd0880d52012-04-27 23:40:13 +020013991 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013992 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013993 Py_ssize_t i;
13994 int sign; /* 1 if '-', else 0 */
13995 int len; /* number of characters */
13996 Py_ssize_t llen;
13997 int numdigits; /* len == numnondigits + numdigits */
13998 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000013999
Victor Stinnerd0880d52012-04-27 23:40:13 +020014000 /* Avoid exceeding SSIZE_T_MAX */
14001 if (prec > INT_MAX-3) {
14002 PyErr_SetString(PyExc_OverflowError,
14003 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000014004 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014005 }
14006
14007 assert(PyLong_Check(val));
14008
14009 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020014010 default:
14011 assert(!"'type' not in [diuoxX]");
Victor Stinnerd0880d52012-04-27 23:40:13 +020014012 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020014013 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020014014 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070014015 /* int and int subclasses should print numerically when a numeric */
14016 /* format code is used (see issue18780) */
14017 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014018 break;
14019 case 'o':
14020 numnondigits = 2;
14021 result = PyNumber_ToBase(val, 8);
14022 break;
14023 case 'x':
14024 case 'X':
14025 numnondigits = 2;
14026 result = PyNumber_ToBase(val, 16);
14027 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014028 }
14029 if (!result)
14030 return NULL;
14031
14032 assert(unicode_modifiable(result));
14033 assert(PyUnicode_IS_READY(result));
14034 assert(PyUnicode_IS_ASCII(result));
14035
14036 /* To modify the string in-place, there can only be one reference. */
14037 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014038 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014039 PyErr_BadInternalCall();
14040 return NULL;
14041 }
14042 buf = PyUnicode_DATA(result);
14043 llen = PyUnicode_GET_LENGTH(result);
14044 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014045 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014046 PyErr_SetString(PyExc_ValueError,
Ethan Furmanb95b5612015-01-23 20:05:18 -080014047 "string too large in _PyUnicode_FormatLong");
Victor Stinnerd0880d52012-04-27 23:40:13 +020014048 return NULL;
14049 }
14050 len = (int)llen;
14051 sign = buf[0] == '-';
14052 numnondigits += sign;
14053 numdigits = len - numnondigits;
14054 assert(numdigits > 0);
14055
14056 /* Get rid of base marker unless F_ALT */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014057 if (((alt) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020014058 (type == 'o' || type == 'x' || type == 'X'))) {
14059 assert(buf[sign] == '0');
14060 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14061 buf[sign+1] == 'o');
14062 numnondigits -= 2;
14063 buf += 2;
14064 len -= 2;
14065 if (sign)
14066 buf[0] = '-';
14067 assert(len == numnondigits + numdigits);
14068 assert(numdigits > 0);
14069 }
14070
14071 /* Fill with leading zeroes to meet minimum width. */
14072 if (prec > numdigits) {
14073 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14074 numnondigits + prec);
14075 char *b1;
14076 if (!r1) {
14077 Py_DECREF(result);
14078 return NULL;
14079 }
14080 b1 = PyBytes_AS_STRING(r1);
14081 for (i = 0; i < numnondigits; ++i)
14082 *b1++ = *buf++;
14083 for (i = 0; i < prec - numdigits; i++)
14084 *b1++ = '0';
14085 for (i = 0; i < numdigits; i++)
14086 *b1++ = *buf++;
14087 *b1 = '\0';
14088 Py_DECREF(result);
14089 result = r1;
14090 buf = PyBytes_AS_STRING(result);
14091 len = numnondigits + prec;
14092 }
14093
14094 /* Fix up case for hex conversions. */
14095 if (type == 'X') {
14096 /* Need to convert all lower case letters to upper case.
14097 and need to convert 0x to 0X (and -0x to -0X). */
14098 for (i = 0; i < len; i++)
14099 if (buf[i] >= 'a' && buf[i] <= 'x')
14100 buf[i] -= 'a'-'A';
14101 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014102 if (!PyUnicode_Check(result)
14103 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020014104 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014105 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014106 Py_DECREF(result);
14107 result = unicode;
14108 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014109 else if (len != PyUnicode_GET_LENGTH(result)) {
14110 if (PyUnicode_Resize(&result, len) < 0)
14111 Py_CLEAR(result);
14112 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014113 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000014114}
14115
Ethan Furmandf3ed242014-01-05 06:50:30 -080014116/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014117 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014118 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014119 * -1 and raise an exception on error */
14120static int
Victor Stinnera47082312012-10-04 02:19:54 +020014121mainformatlong(PyObject *v,
14122 struct unicode_format_arg_t *arg,
14123 PyObject **p_output,
14124 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014125{
14126 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014127 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014128
14129 if (!PyNumber_Check(v))
14130 goto wrongtype;
14131
Ethan Furman9ab74802014-03-21 06:38:46 -070014132 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014133 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014134 if (type == 'o' || type == 'x' || type == 'X') {
14135 iobj = PyNumber_Index(v);
14136 if (iobj == NULL) {
Ethan Furman9ab74802014-03-21 06:38:46 -070014137 if (PyErr_ExceptionMatches(PyExc_TypeError))
14138 goto wrongtype;
Ethan Furman38d872e2014-03-19 08:38:52 -070014139 return -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014140 }
14141 }
14142 else {
14143 iobj = PyNumber_Long(v);
14144 if (iobj == NULL ) {
14145 if (PyErr_ExceptionMatches(PyExc_TypeError))
14146 goto wrongtype;
14147 return -1;
14148 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014149 }
14150 assert(PyLong_Check(iobj));
14151 }
14152 else {
14153 iobj = v;
14154 Py_INCREF(iobj);
14155 }
14156
14157 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014158 && arg->width == -1 && arg->prec == -1
14159 && !(arg->flags & (F_SIGN | F_BLANK))
14160 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014161 {
14162 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014163 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014164 int base;
14165
Victor Stinnera47082312012-10-04 02:19:54 +020014166 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014167 {
14168 default:
14169 assert(0 && "'type' not in [diuoxX]");
14170 case 'd':
14171 case 'i':
14172 case 'u':
14173 base = 10;
14174 break;
14175 case 'o':
14176 base = 8;
14177 break;
14178 case 'x':
14179 case 'X':
14180 base = 16;
14181 break;
14182 }
14183
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014184 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14185 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014186 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014187 }
14188 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014189 return 1;
14190 }
14191
Ethan Furmanb95b5612015-01-23 20:05:18 -080014192 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014193 Py_DECREF(iobj);
14194 if (res == NULL)
14195 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014196 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014197 return 0;
14198
14199wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014200 switch(type)
14201 {
14202 case 'o':
14203 case 'x':
14204 case 'X':
14205 PyErr_Format(PyExc_TypeError,
14206 "%%%c format: an integer is required, "
14207 "not %.200s",
14208 type, Py_TYPE(v)->tp_name);
14209 break;
14210 default:
14211 PyErr_Format(PyExc_TypeError,
14212 "%%%c format: a number is required, "
14213 "not %.200s",
14214 type, Py_TYPE(v)->tp_name);
14215 break;
14216 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014217 return -1;
14218}
14219
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014220static Py_UCS4
14221formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014222{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014223 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014224 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014225 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014226 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014227 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014228 goto onError;
14229 }
14230 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014231 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014232 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014233 /* make sure number is a type of integer */
14234 if (!PyLong_Check(v)) {
14235 iobj = PyNumber_Index(v);
14236 if (iobj == NULL) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014237 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014238 }
14239 v = iobj;
14240 Py_DECREF(iobj);
14241 }
14242 /* Integer input truncated to a character */
Benjamin Peterson29060642009-01-31 22:14:21 +000014243 x = PyLong_AsLong(v);
14244 if (x == -1 && PyErr_Occurred())
14245 goto onError;
14246
Victor Stinner8faf8212011-12-08 22:14:11 +010014247 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014248 PyErr_SetString(PyExc_OverflowError,
14249 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014250 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014251 }
14252
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014253 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014254 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014255
Benjamin Peterson29060642009-01-31 22:14:21 +000014256 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014257 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014258 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014259 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014260}
14261
Victor Stinnera47082312012-10-04 02:19:54 +020014262/* Parse options of an argument: flags, width, precision.
14263 Handle also "%(name)" syntax.
14264
14265 Return 0 if the argument has been formatted into arg->str.
14266 Return 1 if the argument has been written into ctx->writer,
14267 Raise an exception and return -1 on error. */
14268static int
14269unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14270 struct unicode_format_arg_t *arg)
14271{
14272#define FORMAT_READ(ctx) \
14273 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14274
14275 PyObject *v;
14276
Victor Stinnera47082312012-10-04 02:19:54 +020014277 if (arg->ch == '(') {
14278 /* Get argument value from a dictionary. Example: "%(name)s". */
14279 Py_ssize_t keystart;
14280 Py_ssize_t keylen;
14281 PyObject *key;
14282 int pcount = 1;
14283
14284 if (ctx->dict == NULL) {
14285 PyErr_SetString(PyExc_TypeError,
14286 "format requires a mapping");
14287 return -1;
14288 }
14289 ++ctx->fmtpos;
14290 --ctx->fmtcnt;
14291 keystart = ctx->fmtpos;
14292 /* Skip over balanced parentheses */
14293 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14294 arg->ch = FORMAT_READ(ctx);
14295 if (arg->ch == ')')
14296 --pcount;
14297 else if (arg->ch == '(')
14298 ++pcount;
14299 ctx->fmtpos++;
14300 }
14301 keylen = ctx->fmtpos - keystart - 1;
14302 if (ctx->fmtcnt < 0 || pcount > 0) {
14303 PyErr_SetString(PyExc_ValueError,
14304 "incomplete format key");
14305 return -1;
14306 }
14307 key = PyUnicode_Substring(ctx->fmtstr,
14308 keystart, keystart + keylen);
14309 if (key == NULL)
14310 return -1;
14311 if (ctx->args_owned) {
Victor Stinnera47082312012-10-04 02:19:54 +020014312 ctx->args_owned = 0;
Serhiy Storchaka191321d2015-12-27 15:41:34 +020014313 Py_DECREF(ctx->args);
Victor Stinnera47082312012-10-04 02:19:54 +020014314 }
14315 ctx->args = PyObject_GetItem(ctx->dict, key);
14316 Py_DECREF(key);
14317 if (ctx->args == NULL)
14318 return -1;
14319 ctx->args_owned = 1;
14320 ctx->arglen = -1;
14321 ctx->argidx = -2;
14322 }
14323
14324 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014325 while (--ctx->fmtcnt >= 0) {
14326 arg->ch = FORMAT_READ(ctx);
14327 ctx->fmtpos++;
14328 switch (arg->ch) {
14329 case '-': arg->flags |= F_LJUST; continue;
14330 case '+': arg->flags |= F_SIGN; continue;
14331 case ' ': arg->flags |= F_BLANK; continue;
14332 case '#': arg->flags |= F_ALT; continue;
14333 case '0': arg->flags |= F_ZERO; continue;
14334 }
14335 break;
14336 }
14337
14338 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014339 if (arg->ch == '*') {
14340 v = unicode_format_getnextarg(ctx);
14341 if (v == NULL)
14342 return -1;
14343 if (!PyLong_Check(v)) {
14344 PyErr_SetString(PyExc_TypeError,
14345 "* wants int");
14346 return -1;
14347 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014348 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014349 if (arg->width == -1 && PyErr_Occurred())
14350 return -1;
14351 if (arg->width < 0) {
14352 arg->flags |= F_LJUST;
14353 arg->width = -arg->width;
14354 }
14355 if (--ctx->fmtcnt >= 0) {
14356 arg->ch = FORMAT_READ(ctx);
14357 ctx->fmtpos++;
14358 }
14359 }
14360 else if (arg->ch >= '0' && arg->ch <= '9') {
14361 arg->width = arg->ch - '0';
14362 while (--ctx->fmtcnt >= 0) {
14363 arg->ch = FORMAT_READ(ctx);
14364 ctx->fmtpos++;
14365 if (arg->ch < '0' || arg->ch > '9')
14366 break;
14367 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14368 mixing signed and unsigned comparison. Since arg->ch is between
14369 '0' and '9', casting to int is safe. */
14370 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14371 PyErr_SetString(PyExc_ValueError,
14372 "width too big");
14373 return -1;
14374 }
14375 arg->width = arg->width*10 + (arg->ch - '0');
14376 }
14377 }
14378
14379 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014380 if (arg->ch == '.') {
14381 arg->prec = 0;
14382 if (--ctx->fmtcnt >= 0) {
14383 arg->ch = FORMAT_READ(ctx);
14384 ctx->fmtpos++;
14385 }
14386 if (arg->ch == '*') {
14387 v = unicode_format_getnextarg(ctx);
14388 if (v == NULL)
14389 return -1;
14390 if (!PyLong_Check(v)) {
14391 PyErr_SetString(PyExc_TypeError,
14392 "* wants int");
14393 return -1;
14394 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014395 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014396 if (arg->prec == -1 && PyErr_Occurred())
14397 return -1;
14398 if (arg->prec < 0)
14399 arg->prec = 0;
14400 if (--ctx->fmtcnt >= 0) {
14401 arg->ch = FORMAT_READ(ctx);
14402 ctx->fmtpos++;
14403 }
14404 }
14405 else if (arg->ch >= '0' && arg->ch <= '9') {
14406 arg->prec = arg->ch - '0';
14407 while (--ctx->fmtcnt >= 0) {
14408 arg->ch = FORMAT_READ(ctx);
14409 ctx->fmtpos++;
14410 if (arg->ch < '0' || arg->ch > '9')
14411 break;
14412 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14413 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014414 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014415 return -1;
14416 }
14417 arg->prec = arg->prec*10 + (arg->ch - '0');
14418 }
14419 }
14420 }
14421
14422 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14423 if (ctx->fmtcnt >= 0) {
14424 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14425 if (--ctx->fmtcnt >= 0) {
14426 arg->ch = FORMAT_READ(ctx);
14427 ctx->fmtpos++;
14428 }
14429 }
14430 }
14431 if (ctx->fmtcnt < 0) {
14432 PyErr_SetString(PyExc_ValueError,
14433 "incomplete format");
14434 return -1;
14435 }
14436 return 0;
14437
14438#undef FORMAT_READ
14439}
14440
14441/* Format one argument. Supported conversion specifiers:
14442
14443 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014444 - "i", "d", "u": int or float
14445 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014446 - "e", "E", "f", "F", "g", "G": float
14447 - "c": int or str (1 character)
14448
Victor Stinner8dbd4212012-12-04 09:30:24 +010014449 When possible, the output is written directly into the Unicode writer
14450 (ctx->writer). A string is created when padding is required.
14451
Victor Stinnera47082312012-10-04 02:19:54 +020014452 Return 0 if the argument has been formatted into *p_str,
14453 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014454 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014455static int
14456unicode_format_arg_format(struct unicode_formatter_t *ctx,
14457 struct unicode_format_arg_t *arg,
14458 PyObject **p_str)
14459{
14460 PyObject *v;
14461 _PyUnicodeWriter *writer = &ctx->writer;
14462
14463 if (ctx->fmtcnt == 0)
14464 ctx->writer.overallocate = 0;
14465
14466 if (arg->ch == '%') {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014467 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014468 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014469 return 1;
14470 }
14471
14472 v = unicode_format_getnextarg(ctx);
14473 if (v == NULL)
14474 return -1;
14475
Victor Stinnera47082312012-10-04 02:19:54 +020014476
14477 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014478 case 's':
14479 case 'r':
14480 case 'a':
14481 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14482 /* Fast path */
14483 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14484 return -1;
14485 return 1;
14486 }
14487
14488 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14489 *p_str = v;
14490 Py_INCREF(*p_str);
14491 }
14492 else {
14493 if (arg->ch == 's')
14494 *p_str = PyObject_Str(v);
14495 else if (arg->ch == 'r')
14496 *p_str = PyObject_Repr(v);
14497 else
14498 *p_str = PyObject_ASCII(v);
14499 }
14500 break;
14501
14502 case 'i':
14503 case 'd':
14504 case 'u':
14505 case 'o':
14506 case 'x':
14507 case 'X':
14508 {
14509 int ret = mainformatlong(v, arg, p_str, writer);
14510 if (ret != 0)
14511 return ret;
14512 arg->sign = 1;
14513 break;
14514 }
14515
14516 case 'e':
14517 case 'E':
14518 case 'f':
14519 case 'F':
14520 case 'g':
14521 case 'G':
14522 if (arg->width == -1 && arg->prec == -1
14523 && !(arg->flags & (F_SIGN | F_BLANK)))
14524 {
14525 /* Fast path */
14526 if (formatfloat(v, arg, NULL, writer) == -1)
14527 return -1;
14528 return 1;
14529 }
14530
14531 arg->sign = 1;
14532 if (formatfloat(v, arg, p_str, NULL) == -1)
14533 return -1;
14534 break;
14535
14536 case 'c':
14537 {
14538 Py_UCS4 ch = formatchar(v);
14539 if (ch == (Py_UCS4) -1)
14540 return -1;
14541 if (arg->width == -1 && arg->prec == -1) {
14542 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014543 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014544 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014545 return 1;
14546 }
14547 *p_str = PyUnicode_FromOrdinal(ch);
14548 break;
14549 }
14550
14551 default:
14552 PyErr_Format(PyExc_ValueError,
14553 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020014554 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020014555 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14556 (int)arg->ch,
14557 ctx->fmtpos - 1);
14558 return -1;
14559 }
14560 if (*p_str == NULL)
14561 return -1;
14562 assert (PyUnicode_Check(*p_str));
14563 return 0;
14564}
14565
14566static int
14567unicode_format_arg_output(struct unicode_formatter_t *ctx,
14568 struct unicode_format_arg_t *arg,
14569 PyObject *str)
14570{
14571 Py_ssize_t len;
14572 enum PyUnicode_Kind kind;
14573 void *pbuf;
14574 Py_ssize_t pindex;
14575 Py_UCS4 signchar;
14576 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014577 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014578 Py_ssize_t sublen;
14579 _PyUnicodeWriter *writer = &ctx->writer;
14580 Py_UCS4 fill;
14581
14582 fill = ' ';
14583 if (arg->sign && arg->flags & F_ZERO)
14584 fill = '0';
14585
14586 if (PyUnicode_READY(str) == -1)
14587 return -1;
14588
14589 len = PyUnicode_GET_LENGTH(str);
14590 if ((arg->width == -1 || arg->width <= len)
14591 && (arg->prec == -1 || arg->prec >= len)
14592 && !(arg->flags & (F_SIGN | F_BLANK)))
14593 {
14594 /* Fast path */
14595 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
14596 return -1;
14597 return 0;
14598 }
14599
14600 /* Truncate the string for "s", "r" and "a" formats
14601 if the precision is set */
14602 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
14603 if (arg->prec >= 0 && len > arg->prec)
14604 len = arg->prec;
14605 }
14606
14607 /* Adjust sign and width */
14608 kind = PyUnicode_KIND(str);
14609 pbuf = PyUnicode_DATA(str);
14610 pindex = 0;
14611 signchar = '\0';
14612 if (arg->sign) {
14613 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14614 if (ch == '-' || ch == '+') {
14615 signchar = ch;
14616 len--;
14617 pindex++;
14618 }
14619 else if (arg->flags & F_SIGN)
14620 signchar = '+';
14621 else if (arg->flags & F_BLANK)
14622 signchar = ' ';
14623 else
14624 arg->sign = 0;
14625 }
14626 if (arg->width < len)
14627 arg->width = len;
14628
14629 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014630 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014631 if (!(arg->flags & F_LJUST)) {
14632 if (arg->sign) {
14633 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014634 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014635 }
14636 else {
14637 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014638 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014639 }
14640 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014641 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
14642 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014643 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014644 }
14645
Victor Stinnera47082312012-10-04 02:19:54 +020014646 buflen = arg->width;
14647 if (arg->sign && len == arg->width)
14648 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014649 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020014650 return -1;
14651
14652 /* Write the sign if needed */
14653 if (arg->sign) {
14654 if (fill != ' ') {
14655 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14656 writer->pos += 1;
14657 }
14658 if (arg->width > len)
14659 arg->width--;
14660 }
14661
14662 /* Write the numeric prefix for "x", "X" and "o" formats
14663 if the alternate form is used.
14664 For example, write "0x" for the "%#x" format. */
14665 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14666 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14667 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14668 if (fill != ' ') {
14669 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14670 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14671 writer->pos += 2;
14672 pindex += 2;
14673 }
14674 arg->width -= 2;
14675 if (arg->width < 0)
14676 arg->width = 0;
14677 len -= 2;
14678 }
14679
14680 /* Pad left with the fill character if needed */
14681 if (arg->width > len && !(arg->flags & F_LJUST)) {
14682 sublen = arg->width - len;
14683 FILL(writer->kind, writer->data, fill, writer->pos, sublen);
14684 writer->pos += sublen;
14685 arg->width = len;
14686 }
14687
14688 /* If padding with spaces: write sign if needed and/or numeric prefix if
14689 the alternate form is used */
14690 if (fill == ' ') {
14691 if (arg->sign) {
14692 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14693 writer->pos += 1;
14694 }
14695 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14696 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14697 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
14698 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14699 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14700 writer->pos += 2;
14701 pindex += 2;
14702 }
14703 }
14704
14705 /* Write characters */
14706 if (len) {
14707 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14708 str, pindex, len);
14709 writer->pos += len;
14710 }
14711
14712 /* Pad right with the fill character if needed */
14713 if (arg->width > len) {
14714 sublen = arg->width - len;
14715 FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
14716 writer->pos += sublen;
14717 }
14718 return 0;
14719}
14720
14721/* Helper of PyUnicode_Format(): format one arg.
14722 Return 0 on success, raise an exception and return -1 on error. */
14723static int
14724unicode_format_arg(struct unicode_formatter_t *ctx)
14725{
14726 struct unicode_format_arg_t arg;
14727 PyObject *str;
14728 int ret;
14729
Victor Stinner8dbd4212012-12-04 09:30:24 +010014730 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
14731 arg.flags = 0;
14732 arg.width = -1;
14733 arg.prec = -1;
14734 arg.sign = 0;
14735 str = NULL;
14736
Victor Stinnera47082312012-10-04 02:19:54 +020014737 ret = unicode_format_arg_parse(ctx, &arg);
14738 if (ret == -1)
14739 return -1;
14740
14741 ret = unicode_format_arg_format(ctx, &arg, &str);
14742 if (ret == -1)
14743 return -1;
14744
14745 if (ret != 1) {
14746 ret = unicode_format_arg_output(ctx, &arg, str);
14747 Py_DECREF(str);
14748 if (ret == -1)
14749 return -1;
14750 }
14751
14752 if (ctx->dict && (ctx->argidx < ctx->arglen) && arg.ch != '%') {
14753 PyErr_SetString(PyExc_TypeError,
14754 "not all arguments converted during string formatting");
14755 return -1;
14756 }
14757 return 0;
14758}
14759
Alexander Belopolsky40018472011-02-26 01:02:56 +000014760PyObject *
14761PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014762{
Victor Stinnera47082312012-10-04 02:19:54 +020014763 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000014764
Guido van Rossumd57fd912000-03-10 22:53:23 +000014765 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014766 PyErr_BadInternalCall();
14767 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014768 }
Victor Stinnera47082312012-10-04 02:19:54 +020014769
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014770 if (ensure_unicode(format) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014771 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014772
14773 ctx.fmtstr = format;
Victor Stinnera47082312012-10-04 02:19:54 +020014774 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14775 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14776 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14777 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014778
Victor Stinner8f674cc2013-04-17 23:02:17 +020014779 _PyUnicodeWriter_Init(&ctx.writer);
14780 ctx.writer.min_length = ctx.fmtcnt + 100;
14781 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014782
Guido van Rossumd57fd912000-03-10 22:53:23 +000014783 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014784 ctx.arglen = PyTuple_Size(args);
14785 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014786 }
14787 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014788 ctx.arglen = -1;
14789 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014790 }
Victor Stinnera47082312012-10-04 02:19:54 +020014791 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040014792 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020014793 ctx.dict = args;
14794 else
14795 ctx.dict = NULL;
14796 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014797
Victor Stinnera47082312012-10-04 02:19:54 +020014798 while (--ctx.fmtcnt >= 0) {
14799 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020014800 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020014801
14802 nonfmtpos = ctx.fmtpos++;
14803 while (ctx.fmtcnt >= 0 &&
14804 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14805 ctx.fmtpos++;
14806 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014807 }
Victor Stinnera47082312012-10-04 02:19:54 +020014808 if (ctx.fmtcnt < 0) {
14809 ctx.fmtpos--;
14810 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020014811 }
Victor Stinneree4544c2012-05-09 22:24:08 +020014812
Victor Stinnercfc4c132013-04-03 01:48:39 +020014813 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14814 nonfmtpos, ctx.fmtpos) < 0)
14815 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014816 }
14817 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014818 ctx.fmtpos++;
14819 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000014820 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020014821 }
14822 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020014823
Victor Stinnera47082312012-10-04 02:19:54 +020014824 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014825 PyErr_SetString(PyExc_TypeError,
14826 "not all arguments converted during string formatting");
14827 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014828 }
14829
Victor Stinnera47082312012-10-04 02:19:54 +020014830 if (ctx.args_owned) {
14831 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014832 }
Victor Stinnera47082312012-10-04 02:19:54 +020014833 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014834
Benjamin Peterson29060642009-01-31 22:14:21 +000014835 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020014836 _PyUnicodeWriter_Dealloc(&ctx.writer);
14837 if (ctx.args_owned) {
14838 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014839 }
14840 return NULL;
14841}
14842
Jeremy Hylton938ace62002-07-17 16:30:39 +000014843static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000014844unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14845
Tim Peters6d6c1a32001-08-02 04:15:00 +000014846static PyObject *
14847unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14848{
Benjamin Peterson29060642009-01-31 22:14:21 +000014849 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014850 static char *kwlist[] = {"object", "encoding", "errors", 0};
14851 char *encoding = NULL;
14852 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000014853
Benjamin Peterson14339b62009-01-31 16:36:08 +000014854 if (type != &PyUnicode_Type)
14855 return unicode_subtype_new(type, args, kwds);
14856 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000014857 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000014858 return NULL;
14859 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020014860 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000014861 if (encoding == NULL && errors == NULL)
14862 return PyObject_Str(x);
14863 else
Benjamin Peterson29060642009-01-31 22:14:21 +000014864 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000014865}
14866
Guido van Rossume023fe02001-08-30 03:12:59 +000014867static PyObject *
14868unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14869{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014870 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014871 Py_ssize_t length, char_size;
14872 int share_wstr, share_utf8;
14873 unsigned int kind;
14874 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000014875
Benjamin Peterson14339b62009-01-31 16:36:08 +000014876 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014877
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014878 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014879 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014880 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014881 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050014882 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060014883 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014884 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060014885 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014886
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014887 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014888 if (self == NULL) {
14889 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014890 return NULL;
14891 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014892 kind = PyUnicode_KIND(unicode);
14893 length = PyUnicode_GET_LENGTH(unicode);
14894
14895 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014896#ifdef Py_DEBUG
14897 _PyUnicode_HASH(self) = -1;
14898#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014899 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014900#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014901 _PyUnicode_STATE(self).interned = 0;
14902 _PyUnicode_STATE(self).kind = kind;
14903 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020014904 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014905 _PyUnicode_STATE(self).ready = 1;
14906 _PyUnicode_WSTR(self) = NULL;
14907 _PyUnicode_UTF8_LENGTH(self) = 0;
14908 _PyUnicode_UTF8(self) = NULL;
14909 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020014910 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014911
14912 share_utf8 = 0;
14913 share_wstr = 0;
14914 if (kind == PyUnicode_1BYTE_KIND) {
14915 char_size = 1;
14916 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
14917 share_utf8 = 1;
14918 }
14919 else if (kind == PyUnicode_2BYTE_KIND) {
14920 char_size = 2;
14921 if (sizeof(wchar_t) == 2)
14922 share_wstr = 1;
14923 }
14924 else {
14925 assert(kind == PyUnicode_4BYTE_KIND);
14926 char_size = 4;
14927 if (sizeof(wchar_t) == 4)
14928 share_wstr = 1;
14929 }
14930
14931 /* Ensure we won't overflow the length. */
14932 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
14933 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014934 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014935 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014936 data = PyObject_MALLOC((length + 1) * char_size);
14937 if (data == NULL) {
14938 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014939 goto onError;
14940 }
14941
Victor Stinnerc3c74152011-10-02 20:39:55 +020014942 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014943 if (share_utf8) {
14944 _PyUnicode_UTF8_LENGTH(self) = length;
14945 _PyUnicode_UTF8(self) = data;
14946 }
14947 if (share_wstr) {
14948 _PyUnicode_WSTR_LENGTH(self) = length;
14949 _PyUnicode_WSTR(self) = (wchar_t *)data;
14950 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014951
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014952 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020014953 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014954 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014955#ifdef Py_DEBUG
14956 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14957#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020014958 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010014959 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014960
14961onError:
14962 Py_DECREF(unicode);
14963 Py_DECREF(self);
14964 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000014965}
14966
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000014967PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070014968"str(object='') -> str\n\
14969str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000014970\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100014971Create a new string object from the given object. If encoding or\n\
14972errors is specified, then the object must expose a data buffer\n\
14973that will be decoded using the given encoding and error handler.\n\
14974Otherwise, returns the result of object.__str__() (if defined)\n\
14975or repr(object).\n\
14976encoding defaults to sys.getdefaultencoding().\n\
14977errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000014978
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014979static PyObject *unicode_iter(PyObject *seq);
14980
Guido van Rossumd57fd912000-03-10 22:53:23 +000014981PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000014982 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014983 "str", /* tp_name */
14984 sizeof(PyUnicodeObject), /* tp_size */
14985 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014986 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014987 (destructor)unicode_dealloc, /* tp_dealloc */
14988 0, /* tp_print */
14989 0, /* tp_getattr */
14990 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014991 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014992 unicode_repr, /* tp_repr */
14993 &unicode_as_number, /* tp_as_number */
14994 &unicode_as_sequence, /* tp_as_sequence */
14995 &unicode_as_mapping, /* tp_as_mapping */
14996 (hashfunc) unicode_hash, /* tp_hash*/
14997 0, /* tp_call*/
14998 (reprfunc) unicode_str, /* tp_str */
14999 PyObject_GenericGetAttr, /* tp_getattro */
15000 0, /* tp_setattro */
15001 0, /* tp_as_buffer */
15002 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000015003 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015004 unicode_doc, /* tp_doc */
15005 0, /* tp_traverse */
15006 0, /* tp_clear */
15007 PyUnicode_RichCompare, /* tp_richcompare */
15008 0, /* tp_weaklistoffset */
15009 unicode_iter, /* tp_iter */
15010 0, /* tp_iternext */
15011 unicode_methods, /* tp_methods */
15012 0, /* tp_members */
15013 0, /* tp_getset */
15014 &PyBaseObject_Type, /* tp_base */
15015 0, /* tp_dict */
15016 0, /* tp_descr_get */
15017 0, /* tp_descr_set */
15018 0, /* tp_dictoffset */
15019 0, /* tp_init */
15020 0, /* tp_alloc */
15021 unicode_new, /* tp_new */
15022 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015023};
15024
15025/* Initialize the Unicode implementation */
15026
Victor Stinner3a50e702011-10-18 21:21:00 +020015027int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015028{
Thomas Wouters477c8d52006-05-27 19:21:47 +000015029 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015030 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000015031 0x000A, /* LINE FEED */
15032 0x000D, /* CARRIAGE RETURN */
15033 0x001C, /* FILE SEPARATOR */
15034 0x001D, /* GROUP SEPARATOR */
15035 0x001E, /* RECORD SEPARATOR */
15036 0x0085, /* NEXT LINE */
15037 0x2028, /* LINE SEPARATOR */
15038 0x2029, /* PARAGRAPH SEPARATOR */
15039 };
15040
Fred Drakee4315f52000-05-09 19:53:39 +000015041 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020015042 _Py_INCREF_UNICODE_EMPTY();
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015043 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015044 Py_FatalError("Can't create empty string");
Serhiy Storchaka678db842013-01-26 12:16:36 +020015045 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015046
Guido van Rossumcacfc072002-05-24 19:01:59 +000015047 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000015048 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000015049
15050 /* initialize the linebreak bloom filter */
15051 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015052 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020015053 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015054
Christian Heimes26532f72013-07-20 14:57:16 +020015055 if (PyType_Ready(&EncodingMapType) < 0)
15056 Py_FatalError("Can't initialize encoding map type");
Victor Stinner3a50e702011-10-18 21:21:00 +020015057
Benjamin Petersonc4311282012-10-30 23:21:10 -040015058 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
15059 Py_FatalError("Can't initialize field name iterator type");
15060
15061 if (PyType_Ready(&PyFormatterIter_Type) < 0)
15062 Py_FatalError("Can't initialize formatter iter type");
Benjamin Petersone8ea97f2012-10-30 23:27:52 -040015063
Victor Stinner3a50e702011-10-18 21:21:00 +020015064 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015065}
15066
/* Clear the free list.  This implementation does nothing and always
   returns 0. */

int
PyUnicode_ClearFreeList(void)
{
    return 0;
}
15074
Guido van Rossumd57fd912000-03-10 22:53:23 +000015075void
Thomas Wouters78890102000-07-22 19:25:51 +000015076_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015077{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000015078 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015079
Serhiy Storchaka05997252013-01-26 12:14:02 +020015080 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +000015081
Serhiy Storchaka05997252013-01-26 12:14:02 +020015082 for (i = 0; i < 256; i++)
15083 Py_CLEAR(unicode_latin1[i]);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020015084 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000015085 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015086}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015087
Walter Dörwald16807132007-05-25 13:52:07 +000015088void
15089PyUnicode_InternInPlace(PyObject **p)
15090{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015091 PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015092 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015093#ifdef Py_DEBUG
15094 assert(s != NULL);
15095 assert(_PyUnicode_CHECK(s));
15096#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000015097 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020015098 return;
15099#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015100 /* If it's a subclass, we don't really know what putting
15101 it in the interned dict might do. */
15102 if (!PyUnicode_CheckExact(s))
15103 return;
15104 if (PyUnicode_CHECK_INTERNED(s))
15105 return;
15106 if (interned == NULL) {
15107 interned = PyDict_New();
15108 if (interned == NULL) {
15109 PyErr_Clear(); /* Don't leave an exception */
15110 return;
15111 }
15112 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015113 Py_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015114 t = PyDict_SetDefault(interned, s, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015115 Py_END_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015116 if (t == NULL) {
15117 PyErr_Clear();
15118 return;
15119 }
15120 if (t != s) {
Victor Stinnerf0335102013-04-14 19:13:03 +020015121 Py_INCREF(t);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030015122 Py_SETREF(*p, t);
Victor Stinnerf0335102013-04-14 19:13:03 +020015123 return;
15124 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015125 /* The two references in interned are not counted by refcnt.
15126 The deallocator will take care of this */
15127 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015128 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000015129}
15130
15131void
15132PyUnicode_InternImmortal(PyObject **p)
15133{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015134 PyUnicode_InternInPlace(p);
15135 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015136 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015137 Py_INCREF(*p);
15138 }
Walter Dörwald16807132007-05-25 13:52:07 +000015139}
15140
15141PyObject *
15142PyUnicode_InternFromString(const char *cp)
15143{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015144 PyObject *s = PyUnicode_FromString(cp);
15145 if (s == NULL)
15146 return NULL;
15147 PyUnicode_InternInPlace(&s);
15148 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015149}
15150
Alexander Belopolsky40018472011-02-26 01:02:56 +000015151void
15152_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000015153{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015154 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015155 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015156 Py_ssize_t i, n;
15157 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000015158
Benjamin Peterson14339b62009-01-31 16:36:08 +000015159 if (interned == NULL || !PyDict_Check(interned))
15160 return;
15161 keys = PyDict_Keys(interned);
15162 if (keys == NULL || !PyList_Check(keys)) {
15163 PyErr_Clear();
15164 return;
15165 }
Walter Dörwald16807132007-05-25 13:52:07 +000015166
Benjamin Peterson14339b62009-01-31 16:36:08 +000015167 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
15168 detector, interned unicode strings are not forcibly deallocated;
15169 rather, we give them their stolen references back, and then clear
15170 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000015171
Benjamin Peterson14339b62009-01-31 16:36:08 +000015172 n = PyList_GET_SIZE(keys);
15173 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000015174 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015175 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015176 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015177 if (PyUnicode_READY(s) == -1) {
15178 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015179 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015180 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015181 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015182 case SSTATE_NOT_INTERNED:
15183 /* XXX Shouldn't happen */
15184 break;
15185 case SSTATE_INTERNED_IMMORTAL:
15186 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015187 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015188 break;
15189 case SSTATE_INTERNED_MORTAL:
15190 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015191 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015192 break;
15193 default:
15194 Py_FatalError("Inconsistent interned string state.");
15195 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015196 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015197 }
15198 fprintf(stderr, "total size of all interned strings: "
15199 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
15200 "mortal/immortal\n", mortal_size, immortal_size);
15201 Py_DECREF(keys);
15202 PyDict_Clear(interned);
Serhiy Storchaka05997252013-01-26 12:14:02 +020015203 Py_CLEAR(interned);
Walter Dörwald16807132007-05-25 13:52:07 +000015204}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015205
15206
15207/********************* Unicode Iterator **************************/
15208
15209typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015210 PyObject_HEAD
15211 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015212 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015213} unicodeiterobject;
15214
15215static void
15216unicodeiter_dealloc(unicodeiterobject *it)
15217{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015218 _PyObject_GC_UNTRACK(it);
15219 Py_XDECREF(it->it_seq);
15220 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015221}
15222
15223static int
15224unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15225{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015226 Py_VISIT(it->it_seq);
15227 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015228}
15229
15230static PyObject *
15231unicodeiter_next(unicodeiterobject *it)
15232{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015233 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015234
Benjamin Peterson14339b62009-01-31 16:36:08 +000015235 assert(it != NULL);
15236 seq = it->it_seq;
15237 if (seq == NULL)
15238 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015239 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015240
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015241 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15242 int kind = PyUnicode_KIND(seq);
15243 void *data = PyUnicode_DATA(seq);
15244 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15245 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015246 if (item != NULL)
15247 ++it->it_index;
15248 return item;
15249 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015250
Benjamin Peterson14339b62009-01-31 16:36:08 +000015251 it->it_seq = NULL;
Serhiy Storchakafbb1c5e2016-03-30 20:40:02 +030015252 Py_DECREF(seq);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015253 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015254}
15255
15256static PyObject *
15257unicodeiter_len(unicodeiterobject *it)
15258{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015259 Py_ssize_t len = 0;
15260 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015261 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015262 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015263}
15264
15265PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15266
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015267static PyObject *
15268unicodeiter_reduce(unicodeiterobject *it)
15269{
15270 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020015271 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015272 it->it_seq, it->it_index);
15273 } else {
15274 PyObject *u = PyUnicode_FromUnicode(NULL, 0);
15275 if (u == NULL)
15276 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020015277 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015278 }
15279}
15280
15281PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15282
15283static PyObject *
15284unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15285{
15286 Py_ssize_t index = PyLong_AsSsize_t(state);
15287 if (index == -1 && PyErr_Occurred())
15288 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015289 if (it->it_seq != NULL) {
15290 if (index < 0)
15291 index = 0;
15292 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15293 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15294 it->it_index = index;
15295 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015296 Py_RETURN_NONE;
15297}
15298
15299PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15300
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015301static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015302 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015303 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015304 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15305 reduce_doc},
15306 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15307 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015308 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015309};
15310
15311PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015312 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15313 "str_iterator", /* tp_name */
15314 sizeof(unicodeiterobject), /* tp_basicsize */
15315 0, /* tp_itemsize */
15316 /* methods */
15317 (destructor)unicodeiter_dealloc, /* tp_dealloc */
15318 0, /* tp_print */
15319 0, /* tp_getattr */
15320 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000015321 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015322 0, /* tp_repr */
15323 0, /* tp_as_number */
15324 0, /* tp_as_sequence */
15325 0, /* tp_as_mapping */
15326 0, /* tp_hash */
15327 0, /* tp_call */
15328 0, /* tp_str */
15329 PyObject_GenericGetAttr, /* tp_getattro */
15330 0, /* tp_setattro */
15331 0, /* tp_as_buffer */
15332 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15333 0, /* tp_doc */
15334 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15335 0, /* tp_clear */
15336 0, /* tp_richcompare */
15337 0, /* tp_weaklistoffset */
15338 PyObject_SelfIter, /* tp_iter */
15339 (iternextfunc)unicodeiter_next, /* tp_iternext */
15340 unicodeiter_methods, /* tp_methods */
15341 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015342};
15343
15344static PyObject *
15345unicode_iter(PyObject *seq)
15346{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015347 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015348
Benjamin Peterson14339b62009-01-31 16:36:08 +000015349 if (!PyUnicode_Check(seq)) {
15350 PyErr_BadInternalCall();
15351 return NULL;
15352 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015353 if (PyUnicode_READY(seq) == -1)
15354 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015355 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15356 if (it == NULL)
15357 return NULL;
15358 it->it_index = 0;
15359 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015360 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015361 _PyObject_GC_TRACK(it);
15362 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015363}
15364
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015365
15366size_t
15367Py_UNICODE_strlen(const Py_UNICODE *u)
15368{
15369 int res = 0;
15370 while(*u++)
15371 res++;
15372 return res;
15373}
15374
15375Py_UNICODE*
15376Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15377{
15378 Py_UNICODE *u = s1;
15379 while ((*u++ = *s2++));
15380 return s1;
15381}
15382
15383Py_UNICODE*
15384Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15385{
15386 Py_UNICODE *u = s1;
15387 while ((*u++ = *s2++))
15388 if (n-- == 0)
15389 break;
15390 return s1;
15391}
15392
15393Py_UNICODE*
15394Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15395{
15396 Py_UNICODE *u1 = s1;
15397 u1 += Py_UNICODE_strlen(u1);
15398 Py_UNICODE_strcpy(u1, s2);
15399 return s1;
15400}
15401
15402int
15403Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15404{
15405 while (*s1 && *s2 && *s1 == *s2)
15406 s1++, s2++;
15407 if (*s1 && *s2)
15408 return (*s1 < *s2) ? -1 : +1;
15409 if (*s1)
15410 return 1;
15411 if (*s2)
15412 return -1;
15413 return 0;
15414}
15415
15416int
15417Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15418{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015419 Py_UNICODE u1, u2;
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015420 for (; n != 0; n--) {
15421 u1 = *s1;
15422 u2 = *s2;
15423 if (u1 != u2)
15424 return (u1 < u2) ? -1 : +1;
15425 if (u1 == '\0')
15426 return 0;
15427 s1++;
15428 s2++;
15429 }
15430 return 0;
15431}
15432
15433Py_UNICODE*
15434Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15435{
15436 const Py_UNICODE *p;
15437 for (p = s; *p; p++)
15438 if (*p == c)
15439 return (Py_UNICODE*)p;
15440 return NULL;
15441}
15442
15443Py_UNICODE*
15444Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15445{
15446 const Py_UNICODE *p;
15447 p = s + Py_UNICODE_strlen(s);
15448 while (p != s) {
15449 p--;
15450 if (*p == c)
15451 return (Py_UNICODE*)p;
15452 }
15453 return NULL;
15454}
Victor Stinner331ea922010-08-10 16:37:20 +000015455
Victor Stinner71133ff2010-09-01 23:43:53 +000015456Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015457PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015458{
Victor Stinner577db2c2011-10-11 22:12:48 +020015459 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015460 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015461
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015462 if (!PyUnicode_Check(unicode)) {
15463 PyErr_BadArgument();
15464 return NULL;
15465 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015466 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015467 if (u == NULL)
15468 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015469 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -070015470 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015471 PyErr_NoMemory();
15472 return NULL;
15473 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015474 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015475 size *= sizeof(Py_UNICODE);
15476 copy = PyMem_Malloc(size);
15477 if (copy == NULL) {
15478 PyErr_NoMemory();
15479 return NULL;
15480 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015481 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015482 return copy;
15483}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015484
Georg Brandl66c221e2010-10-14 07:04:07 +000015485/* A _string module, to export formatter_parser and formatter_field_name_split
15486 to the string.Formatter class implemented in Python. */
15487
15488static PyMethodDef _string_methods[] = {
15489 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15490 METH_O, PyDoc_STR("split the argument as a field name")},
15491 {"formatter_parser", (PyCFunction) formatter_parser,
15492 METH_O, PyDoc_STR("parse the argument as a format string")},
15493 {NULL, NULL}
15494};
15495
15496static struct PyModuleDef _string_module = {
15497 PyModuleDef_HEAD_INIT,
15498 "_string",
15499 PyDoc_STR("string helper module"),
15500 0,
15501 _string_methods,
15502 NULL,
15503 NULL,
15504 NULL,
15505 NULL
15506};
15507
15508PyMODINIT_FUNC
15509PyInit__string(void)
15510{
15511 return PyModule_Create(&_string_module);
15512}
15513
15514
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000015515#ifdef __cplusplus
15516}
15517#endif