blob: 7de44ce6e7af2b579f56d639b346663c5db0da00 [file] [log] [blame]
/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com>.

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Raymond Hettingerac2ef652015-07-04 16:04:44 -070045#include "stringlib/eq.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000046
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000047#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000048#include <windows.h>
49#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000050
Larry Hastings61272b72014-01-07 12:41:53 -080051/*[clinic input]
Larry Hastingsc2047262014-01-25 20:43:29 -080052class str "PyUnicodeObject *" "&PyUnicode_Type"
Larry Hastings61272b72014-01-07 12:41:53 -080053[clinic start generated code]*/
Larry Hastings581ee362014-01-28 05:00:08 -080054/*[clinic end generated code: output=da39a3ee5e6b4b0d input=604e916854800fa8]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080055
/* --- Globals ------------------------------------------------------------

NOTE: In the interpreter's initialization phase, some globals are currently
      initialized dynamically as needed. In the process Unicode objects may
      be created before the Unicode type is ready.

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000063
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000064
65#ifdef __cplusplus
66extern "C" {
67#endif
68
/* Largest valid Unicode code point: U+10FFFF (1,114,111). */
#define MAX_UNICODE 0x10ffff

/* Sanity check applied by the accessor macros in this file: a structural
   consistency check (without content validation) in debug builds, a plain
   type check otherwise. */
#if defined(Py_DEBUG)
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020077
/* Field accessors.  Macros whose name starts with an underscore read the
   structure member directly; the PyUnicode_* variants first assert that the
   object passes _PyUnicode_CHECK() and is ready. */

/* utf8 member of the compact header (unchecked). */
#define _PyUnicode_UTF8(op) \
    (((PyCompactUnicodeObject*)(op))->utf8)

/* UTF-8 data of a ready string: for compact ASCII strings this is the
   memory located right after the PyASCIIObject header, otherwise the
   utf8 member. */
#define PyUnicode_UTF8(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(PyUnicode_IS_READY(op)), \
     PyUnicode_IS_COMPACT_ASCII(op) ? \
         ((char*)((PyASCIIObject*)(op) + 1)) : \
         _PyUnicode_UTF8(op))

/* utf8_length member of the compact header (unchecked). */
#define _PyUnicode_UTF8_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->utf8_length)

/* UTF-8 length of a ready string: the string length for compact ASCII
   strings, otherwise the utf8_length member. */
#define PyUnicode_UTF8_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(PyUnicode_IS_READY(op)), \
     PyUnicode_IS_COMPACT_ASCII(op) ? \
         ((PyASCIIObject*)(op))->length : \
         _PyUnicode_UTF8_LENGTH(op))

/* Unchecked access to the remaining header members. */
#define _PyUnicode_WSTR(op) \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op) \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op) \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op) \
    (((PyASCIIObject *)(op))->hash)

/* Kind and length, guarded by _PyUnicode_CHECK(). */
#define _PyUnicode_KIND(op) \
    (assert(_PyUnicode_CHECK(op)), \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), \
     ((PyASCIIObject *)(op))->length)

/* data.any member of a PyUnicodeObject (unchecked). */
#define _PyUnicode_DATA_ANY(op) \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200112
/* Override the public PyUnicode_READY() with a variant that validates the
   object through _PyUnicode_CHECK().  Evaluates to 0 when the string is
   already ready, otherwise to the result of _PyUnicode_Ready(op). */
#undef PyUnicode_READY
#define PyUnicode_READY(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (PyUnicode_IS_READY(op) ? 0 : _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200119
/* True if the utf8 pointer of `op` is the same pointer as its character
   data.  Must not be used on compact ASCII strings (asserted). */
#define _PyUnicode_SHARE_UTF8(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)), \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer of `op` is the same pointer as its character
   data.  The wstr lookup uses the macro argument; it previously named a
   variable called `unicode`, which only worked when the caller happened to
   pass a variable of exactly that name. */
#define _PyUnicode_SHARE_WSTR(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
127
/* True if the Unicode object has its own UTF-8 memory block: the object is
   not compact ASCII, the utf8 pointer is set, and it does not point at the
   character data. */
#define _PyUnicode_HAS_UTF8_MEMORY(op) \
    ((!PyUnicode_IS_COMPACT_ASCII(op) \
      && _PyUnicode_UTF8(op) \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* True if the Unicode object has its own wstr memory block: the wstr
   pointer is set and either the string is not ready yet or wstr does not
   point at the character data. */
#define _PyUnicode_HAS_WSTR_MEMORY(op) \
    ((_PyUnicode_WSTR(op) && \
      (!PyUnicode_IS_READY(op) || \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
141
/* Generic helper macro that copies characters while converting their type.

   from_type, to_type: valid type names of the source and target characters.
   begin, end:         pointers delimiting the source characters; they should
                       be of type "from_type *".
   to:                 pointer of type "to_type *" to the buffer receiving
                       the converted characters.

   Each source character is cast to to_type.  The bulk of the input is
   processed four characters per iteration; the remainder one by one. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (from_type *)(begin);  \
        const from_type *_end = (from_type *)(end);     \
        Py_ssize_t _count = (_end) - (_iter);           \
        const from_type *_bulk_end =                    \
            _iter + _Py_SIZE_ROUND_DOWN(_count, 4);     \
        for (; _iter < (_bulk_end); _iter += 4, _to += 4) { \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
        }                                               \
        for (; _iter < (_end); ++_iter, ++_to)          \
            *_to = (to_type) *_iter;                    \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200165
/* Platform-specific over-allocation factor.
   NOTE(review): the percentages below only make sense if the factor is used
   as a divisor (extra = size / OVERALLOCATE_FACTOR) -- confirm against the
   code that uses this macro. */
#if defined(MS_WINDOWS)
   /* On Windows, overallocating by 50% is the best factor */
#  define OVERALLOCATE_FACTOR 2
#else
   /* On Linux, overallocating by 25% is the best factor */
#  define OVERALLOCATE_FACTOR 4
#endif
173
Walter Dörwald16807132007-05-25 13:52:07 +0000174/* This dictionary holds all interned unicode strings. Note that references
175 to strings in this dictionary are *not* counted in the string's ob_refcnt.
176 When the interned string reaches a refcnt of 0 the string deallocation
177 function will delete the reference from this dictionary.
178
179 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000180 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000181*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200182static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000183
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000184/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200185static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200186
Serhiy Storchaka678db842013-01-26 12:16:36 +0200187#define _Py_INCREF_UNICODE_EMPTY() \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200188 do { \
189 if (unicode_empty != NULL) \
190 Py_INCREF(unicode_empty); \
191 else { \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200192 unicode_empty = PyUnicode_New(0, 0); \
193 if (unicode_empty != NULL) { \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200194 Py_INCREF(unicode_empty); \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200195 assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
196 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200197 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200198 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000199
Serhiy Storchaka678db842013-01-26 12:16:36 +0200200#define _Py_RETURN_UNICODE_EMPTY() \
201 do { \
202 _Py_INCREF_UNICODE_EMPTY(); \
203 return unicode_empty; \
204 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000205
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200206/* Forward declaration */
207Py_LOCAL_INLINE(int)
208_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
209
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200210/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200211static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200212
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000213/* Single character Unicode strings in the Latin-1 range are being
214 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200215static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000216
/* Fast detection of the most frequent whitespace characters.

   Lookup table for the 128 ASCII code points; an entry is 1 for:
     0x09 CHARACTER TABULATION, 0x0A LINE FEED, 0x0B LINE TABULATION,
     0x0C FORM FEED, 0x0D CARRIAGE RETURN,
     0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR, 0x1E RECORD SEPARATOR,
     0x1F UNIT SEPARATOR, 0x20 SPACE
   and 0 for everything else. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x0F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x2F */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 - 0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 - 0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
247
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200248/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200249static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200250static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100251static int unicode_modifiable(PyObject *unicode);
252
Victor Stinnerfe226c02011-10-03 03:52:20 +0200253
Alexander Belopolsky40018472011-02-26 01:02:56 +0000254static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100255_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200256static PyObject *
257_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
258static PyObject *
259_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
260
261static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000262unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000263 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100264 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000265 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
266
Alexander Belopolsky40018472011-02-26 01:02:56 +0000267static void
268raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300269 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100270 PyObject *unicode,
271 Py_ssize_t startpos, Py_ssize_t endpos,
272 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000273
/* Same kind of lookup table for line breaks.

   Covers the 128 ASCII code points; an entry is 1 for:
     0x0A LINE FEED, 0x0B LINE TABULATION, 0x0C FORM FEED,
     0x0D CARRIAGE RETURN,
     0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR, 0x1E RECORD SEPARATOR
   and 0 for everything else. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x0F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x2F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 - 0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 - 0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
301
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300302#include "clinic/unicodeobject.c.h"
303
/* Codec error handlers recognised by name.  _Py_ERROR_UNKNOWN is 0;
   _Py_ERROR_OTHER stands for any name not listed in get_error_handler(). */
typedef enum {
    _Py_ERROR_UNKNOWN=0,
    _Py_ERROR_STRICT,
    _Py_ERROR_SURROGATEESCAPE,
    _Py_ERROR_REPLACE,
    _Py_ERROR_IGNORE,
    _Py_ERROR_BACKSLASHREPLACE,
    _Py_ERROR_SURROGATEPASS,
    _Py_ERROR_XMLCHARREFREPLACE,
    _Py_ERROR_OTHER
} _Py_error_handler;

/* Map an error handler name to its _Py_error_handler constant.

   errors: handler name, or NULL.

   Returns _Py_ERROR_STRICT for NULL or "strict", the matching constant for
   the other names in the table, and _Py_ERROR_OTHER for anything else.
   The comparison is an exact, case-sensitive strcmp(). */
static _Py_error_handler
get_error_handler(const char *errors)
{
    static const struct {
        const char *name;
        _Py_error_handler handler;
    } known_handlers[] = {
        {"strict",            _Py_ERROR_STRICT},
        {"surrogateescape",   _Py_ERROR_SURROGATEESCAPE},
        {"replace",           _Py_ERROR_REPLACE},
        {"ignore",            _Py_ERROR_IGNORE},
        {"backslashreplace",  _Py_ERROR_BACKSLASHREPLACE},
        {"surrogatepass",     _Py_ERROR_SURROGATEPASS},
        {"xmlcharrefreplace", _Py_ERROR_XMLCHARREFREPLACE}
    };
    size_t idx;

    if (errors == NULL)
        return _Py_ERROR_STRICT;

    for (idx = 0; idx < sizeof(known_handlers) / sizeof(known_handlers[0]);
         idx++) {
        if (strcmp(errors, known_handlers[idx].name) == 0)
            return known_handlers[idx].handler;
    }
    return _Py_ERROR_OTHER;
}
335
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300336/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
337 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000338Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000339PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000340{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000341#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000342 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000343#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000344 /* This is actually an illegal character, so it should
345 not be passed to unichr. */
346 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000347#endif
348}
349
#ifdef Py_DEBUG
/* Debug-build validator for the internal state of a Unicode object.

   op:            object to check; must satisfy PyUnicode_Check().
   check_content: if non-zero (and the string is not in the wchar-only
                  state), also scan the characters to verify that the
                  narrowest possible kind is used and that the buffer is
                  NUL-terminated.

   Every violation trips an assert(); the function itself always returns 1,
   so that it can be used inside assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII string */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;
        /* kind whose code units have the same size as wchar_t */
#if SIZEOF_WCHAR_T == 2
        const unsigned int wchar_sized_kind = PyUnicode_2BYTE_KIND;
#else
        const unsigned int wchar_sized_kind = PyUnicode_4BYTE_KIND;
#endif

        if (ascii->state.compact == 1) {
            /* compact, non-ASCII string: data follows the compact header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }
        else {
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind != PyUnicode_WCHAR_KIND) {
                /* non-compact string that has been made ready */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
                else
                    assert (compact->utf8 != data);
            }
            else {
                /* not ready: only the wstr representation exists */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr must alias the data exactly when the kind has the
               size of wchar_t */
            if (kind == wchar_sized_kind) {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            }
            else
                assert(ascii->wstr != data);
        }

        /* a missing representation must have a zero length */
        assert(compact->utf8 != NULL || compact->utf8_length == 0);
        assert(ascii->wstr != NULL || compact->wstr_length == 0);
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        void *data = PyUnicode_DATA(ascii);
        Py_UCS4 maxchar = 0;
        Py_ssize_t pos;

        for (pos = 0; pos < ascii->length; pos++) {
            Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
            if (maxchar < ch)
                maxchar = ch;
        }

        switch (kind) {
        case PyUnicode_1BYTE_KIND:
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else
                assert(maxchar < 128);
            break;
        case PyUnicode_2BYTE_KIND:
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
            break;
        default:
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
            break;
        }
        /* the character after the last one must be a NUL */
        assert(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;
}
#endif
465
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100466static PyObject*
467unicode_result_wchar(PyObject *unicode)
468{
469#ifndef Py_DEBUG
470 Py_ssize_t len;
471
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100472 len = _PyUnicode_WSTR_LENGTH(unicode);
473 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100474 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200475 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100476 }
477
478 if (len == 1) {
479 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100480 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100481 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
482 Py_DECREF(unicode);
483 return latin1_char;
484 }
485 }
486
487 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200488 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100489 return NULL;
490 }
491#else
Victor Stinneraa771272012-10-04 02:32:58 +0200492 assert(Py_REFCNT(unicode) == 1);
493
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100494 /* don't make the result ready in debug mode to ensure that the caller
495 makes the string ready before using it */
496 assert(_PyUnicode_CheckConsistency(unicode, 1));
497#endif
498 return unicode;
499}
500
501static PyObject*
502unicode_result_ready(PyObject *unicode)
503{
504 Py_ssize_t length;
505
506 length = PyUnicode_GET_LENGTH(unicode);
507 if (length == 0) {
508 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100509 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200510 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100511 }
512 return unicode_empty;
513 }
514
515 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200516 void *data = PyUnicode_DATA(unicode);
517 int kind = PyUnicode_KIND(unicode);
518 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100519 if (ch < 256) {
520 PyObject *latin1_char = unicode_latin1[ch];
521 if (latin1_char != NULL) {
522 if (unicode != latin1_char) {
523 Py_INCREF(latin1_char);
524 Py_DECREF(unicode);
525 }
526 return latin1_char;
527 }
528 else {
529 assert(_PyUnicode_CheckConsistency(unicode, 1));
530 Py_INCREF(unicode);
531 unicode_latin1[ch] = unicode;
532 return unicode;
533 }
534 }
535 }
536
537 assert(_PyUnicode_CheckConsistency(unicode, 1));
538 return unicode;
539}
540
541static PyObject*
542unicode_result(PyObject *unicode)
543{
544 assert(_PyUnicode_CHECK(unicode));
545 if (PyUnicode_IS_READY(unicode))
546 return unicode_result_ready(unicode);
547 else
548 return unicode_result_wchar(unicode);
549}
550
Victor Stinnerc4b49542011-12-11 22:44:26 +0100551static PyObject*
552unicode_result_unchanged(PyObject *unicode)
553{
554 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500555 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100556 return NULL;
557 Py_INCREF(unicode);
558 return unicode;
559 }
560 else
561 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100562 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100563}
564
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200565/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
566 ASCII, Latin1, UTF-8, etc. */
567static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200568backslashreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200569 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
570{
Victor Stinnerad771582015-10-09 12:38:53 +0200571 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200572 Py_UCS4 ch;
573 enum PyUnicode_Kind kind;
574 void *data;
575
576 assert(PyUnicode_IS_READY(unicode));
577 kind = PyUnicode_KIND(unicode);
578 data = PyUnicode_DATA(unicode);
579
580 size = 0;
581 /* determine replacement size */
582 for (i = collstart; i < collend; ++i) {
583 Py_ssize_t incr;
584
585 ch = PyUnicode_READ(kind, data, i);
586 if (ch < 0x100)
587 incr = 2+2;
588 else if (ch < 0x10000)
589 incr = 2+4;
590 else {
591 assert(ch <= MAX_UNICODE);
Victor Stinner3fa36ff2015-10-09 03:37:11 +0200592 incr = 2+8;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200593 }
594 if (size > PY_SSIZE_T_MAX - incr) {
595 PyErr_SetString(PyExc_OverflowError,
596 "encoded result is too long for a Python string");
597 return NULL;
598 }
599 size += incr;
600 }
601
Victor Stinnerad771582015-10-09 12:38:53 +0200602 str = _PyBytesWriter_Prepare(writer, str, size);
603 if (str == NULL)
604 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200605
606 /* generate replacement */
607 for (i = collstart; i < collend; ++i) {
608 ch = PyUnicode_READ(kind, data, i);
Victor Stinner797485e2015-10-09 03:17:30 +0200609 *str++ = '\\';
610 if (ch >= 0x00010000) {
611 *str++ = 'U';
612 *str++ = Py_hexdigits[(ch>>28)&0xf];
613 *str++ = Py_hexdigits[(ch>>24)&0xf];
614 *str++ = Py_hexdigits[(ch>>20)&0xf];
615 *str++ = Py_hexdigits[(ch>>16)&0xf];
616 *str++ = Py_hexdigits[(ch>>12)&0xf];
617 *str++ = Py_hexdigits[(ch>>8)&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200618 }
Victor Stinner797485e2015-10-09 03:17:30 +0200619 else if (ch >= 0x100) {
620 *str++ = 'u';
621 *str++ = Py_hexdigits[(ch>>12)&0xf];
622 *str++ = Py_hexdigits[(ch>>8)&0xf];
623 }
624 else
625 *str++ = 'x';
626 *str++ = Py_hexdigits[(ch>>4)&0xf];
627 *str++ = Py_hexdigits[ch&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200628 }
629 return str;
630}
631
632/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
633 ASCII, Latin1, UTF-8, etc. */
634static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200635xmlcharrefreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200636 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
637{
Victor Stinnerad771582015-10-09 12:38:53 +0200638 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200639 Py_UCS4 ch;
640 enum PyUnicode_Kind kind;
641 void *data;
642
643 assert(PyUnicode_IS_READY(unicode));
644 kind = PyUnicode_KIND(unicode);
645 data = PyUnicode_DATA(unicode);
646
647 size = 0;
648 /* determine replacement size */
649 for (i = collstart; i < collend; ++i) {
650 Py_ssize_t incr;
651
652 ch = PyUnicode_READ(kind, data, i);
653 if (ch < 10)
654 incr = 2+1+1;
655 else if (ch < 100)
656 incr = 2+2+1;
657 else if (ch < 1000)
658 incr = 2+3+1;
659 else if (ch < 10000)
660 incr = 2+4+1;
661 else if (ch < 100000)
662 incr = 2+5+1;
663 else if (ch < 1000000)
664 incr = 2+6+1;
665 else {
666 assert(ch <= MAX_UNICODE);
667 incr = 2+7+1;
668 }
669 if (size > PY_SSIZE_T_MAX - incr) {
670 PyErr_SetString(PyExc_OverflowError,
671 "encoded result is too long for a Python string");
672 return NULL;
673 }
674 size += incr;
675 }
676
Victor Stinnerad771582015-10-09 12:38:53 +0200677 str = _PyBytesWriter_Prepare(writer, str, size);
678 if (str == NULL)
679 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200680
681 /* generate replacement */
682 for (i = collstart; i < collend; ++i) {
683 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
684 }
685 return str;
686}
687
Thomas Wouters477c8d52006-05-27 19:21:47 +0000688/* --- Bloom Filters ----------------------------------------------------- */
689
690/* stuff to implement simple "bloom filters" for Unicode characters.
691 to keep things simple, we use a single bitmask, using the least 5
692 bits from each unicode characters as the bit index. */
693
694/* the linebreak mask is set up by Unicode_Init below */
695
Antoine Pitrouf068f942010-01-13 14:19:12 +0000696#if LONG_BIT >= 128
697#define BLOOM_WIDTH 128
698#elif LONG_BIT >= 64
699#define BLOOM_WIDTH 64
700#elif LONG_BIT >= 32
701#define BLOOM_WIDTH 32
702#else
703#error "LONG_BIT is smaller than 32"
704#endif
705
Thomas Wouters477c8d52006-05-27 19:21:47 +0000706#define BLOOM_MASK unsigned long
707
Serhiy Storchaka05997252013-01-26 12:14:02 +0200708static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000709
Antoine Pitrouf068f942010-01-13 14:19:12 +0000710#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000711
Benjamin Peterson29060642009-01-31 22:14:21 +0000712#define BLOOM_LINEBREAK(ch) \
713 ((ch) < 128U ? ascii_linebreak[(ch)] : \
714 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000715
Alexander Belopolsky40018472011-02-26 01:02:56 +0000716Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200717make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000718{
Victor Stinnera85af502013-04-09 21:53:54 +0200719#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
720 do { \
721 TYPE *data = (TYPE *)PTR; \
722 TYPE *end = data + LEN; \
723 Py_UCS4 ch; \
724 for (; data != end; data++) { \
725 ch = *data; \
726 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
727 } \
728 break; \
729 } while (0)
730
Thomas Wouters477c8d52006-05-27 19:21:47 +0000731 /* calculate simple bloom-style bitmask for a given unicode string */
732
Antoine Pitrouf068f942010-01-13 14:19:12 +0000733 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000734
735 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200736 switch (kind) {
737 case PyUnicode_1BYTE_KIND:
738 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
739 break;
740 case PyUnicode_2BYTE_KIND:
741 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
742 break;
743 case PyUnicode_4BYTE_KIND:
744 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
745 break;
746 default:
747 assert(0);
748 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000749 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200750
751#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000752}
753
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300754static int
755ensure_unicode(PyObject *obj)
756{
757 if (!PyUnicode_Check(obj)) {
758 PyErr_Format(PyExc_TypeError,
759 "must be str, not %.100s",
760 Py_TYPE(obj)->tp_name);
761 return -1;
762 }
763 return PyUnicode_READY(obj);
764}
765
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200766/* Compilation of templated routines */
767
768#include "stringlib/asciilib.h"
769#include "stringlib/fastsearch.h"
770#include "stringlib/partition.h"
771#include "stringlib/split.h"
772#include "stringlib/count.h"
773#include "stringlib/find.h"
774#include "stringlib/find_max_char.h"
775#include "stringlib/localeutil.h"
776#include "stringlib/undef.h"
777
778#include "stringlib/ucs1lib.h"
779#include "stringlib/fastsearch.h"
780#include "stringlib/partition.h"
781#include "stringlib/split.h"
782#include "stringlib/count.h"
783#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300784#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200785#include "stringlib/find_max_char.h"
786#include "stringlib/localeutil.h"
787#include "stringlib/undef.h"
788
789#include "stringlib/ucs2lib.h"
790#include "stringlib/fastsearch.h"
791#include "stringlib/partition.h"
792#include "stringlib/split.h"
793#include "stringlib/count.h"
794#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300795#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200796#include "stringlib/find_max_char.h"
797#include "stringlib/localeutil.h"
798#include "stringlib/undef.h"
799
800#include "stringlib/ucs4lib.h"
801#include "stringlib/fastsearch.h"
802#include "stringlib/partition.h"
803#include "stringlib/split.h"
804#include "stringlib/count.h"
805#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300806#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200807#include "stringlib/find_max_char.h"
808#include "stringlib/localeutil.h"
809#include "stringlib/undef.h"
810
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200811#include "stringlib/unicodedefs.h"
812#include "stringlib/fastsearch.h"
813#include "stringlib/count.h"
814#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100815#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200816
Guido van Rossumd57fd912000-03-10 22:53:23 +0000817/* --- Unicode Object ----------------------------------------------------- */
818
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200819static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200820fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200821
Serhiy Storchakad9d769f2015-03-24 21:55:47 +0200822Py_LOCAL_INLINE(Py_ssize_t) findchar(const void *s, int kind,
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200823 Py_ssize_t size, Py_UCS4 ch,
824 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200825{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200826 switch (kind) {
827 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200828 if ((Py_UCS1) ch != ch)
829 return -1;
830 if (direction > 0)
831 return ucs1lib_find_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
832 else
833 return ucs1lib_rfind_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200834 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200835 if ((Py_UCS2) ch != ch)
836 return -1;
837 if (direction > 0)
838 return ucs2lib_find_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
839 else
840 return ucs2lib_rfind_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200841 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200842 if (direction > 0)
843 return ucs4lib_find_char((Py_UCS4 *) s, size, ch);
844 else
845 return ucs4lib_rfind_char((Py_UCS4 *) s, size, ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200846 default:
847 assert(0);
848 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200849 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200850}
851
#ifdef Py_DEBUG
/* Fill the data of a Unicode string with invalid characters to detect bugs
   earlier.

   Only the characters in the index range [old_length, length) are
   overwritten, every byte being set to 0xff; nothing is done when the
   string did not grow.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    int unit = PyUnicode_KIND(unicode);
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        /* the kind value is used as the character width in bytes */
        memset(bytes + old_length * unit, 0xff,
               (new_length - old_length) * unit);
    }
}
#endif
870
Victor Stinnerfe226c02011-10-03 03:52:20 +0200871static PyObject*
872resize_compact(PyObject *unicode, Py_ssize_t length)
873{
874 Py_ssize_t char_size;
875 Py_ssize_t struct_size;
876 Py_ssize_t new_size;
877 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100878 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +0200879#ifdef Py_DEBUG
880 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
881#endif
882
Victor Stinner79891572012-05-03 13:43:07 +0200883 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200884 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100885 assert(PyUnicode_IS_COMPACT(unicode));
886
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200887 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100888 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200889 struct_size = sizeof(PyASCIIObject);
890 else
891 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200892 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200893
Victor Stinnerfe226c02011-10-03 03:52:20 +0200894 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
895 PyErr_NoMemory();
896 return NULL;
897 }
898 new_size = (struct_size + (length + 1) * char_size);
899
Serhiy Storchaka7aa69082015-12-03 01:02:03 +0200900 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
901 PyObject_DEL(_PyUnicode_UTF8(unicode));
902 _PyUnicode_UTF8(unicode) = NULL;
903 _PyUnicode_UTF8_LENGTH(unicode) = 0;
904 }
Victor Stinner84def372011-12-11 20:04:56 +0100905 _Py_DEC_REFTOTAL;
906 _Py_ForgetReference(unicode);
907
Serhiy Storchaka20b39b22014-09-28 11:27:24 +0300908 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +0100909 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100910 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200911 PyErr_NoMemory();
912 return NULL;
913 }
Victor Stinner84def372011-12-11 20:04:56 +0100914 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200915 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100916
Victor Stinnerfe226c02011-10-03 03:52:20 +0200917 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200918 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200919 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100920 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200921 _PyUnicode_WSTR_LENGTH(unicode) = length;
922 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100923 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
924 PyObject_DEL(_PyUnicode_WSTR(unicode));
925 _PyUnicode_WSTR(unicode) = NULL;
Victor Stinner5bc03a62016-01-27 16:56:53 +0100926 if (!PyUnicode_IS_ASCII(unicode))
927 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100928 }
Victor Stinnerafffce42012-10-03 23:03:17 +0200929#ifdef Py_DEBUG
930 unicode_fill_invalid(unicode, old_length);
931#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200932 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
933 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200934 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200935 return unicode;
936}
937
Alexander Belopolsky40018472011-02-26 01:02:56 +0000938static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200939resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000940{
Victor Stinner95663112011-10-04 01:03:50 +0200941 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100942 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200943 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200944 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000945
Victor Stinnerfe226c02011-10-03 03:52:20 +0200946 if (PyUnicode_IS_READY(unicode)) {
947 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200948 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200949 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +0200950#ifdef Py_DEBUG
951 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
952#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200953
954 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200955 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200956 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
957 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200958
959 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
960 PyErr_NoMemory();
961 return -1;
962 }
963 new_size = (length + 1) * char_size;
964
Victor Stinner7a9105a2011-12-12 00:13:42 +0100965 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
966 {
967 PyObject_DEL(_PyUnicode_UTF8(unicode));
968 _PyUnicode_UTF8(unicode) = NULL;
969 _PyUnicode_UTF8_LENGTH(unicode) = 0;
970 }
971
Victor Stinnerfe226c02011-10-03 03:52:20 +0200972 data = (PyObject *)PyObject_REALLOC(data, new_size);
973 if (data == NULL) {
974 PyErr_NoMemory();
975 return -1;
976 }
977 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200978 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200979 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200980 _PyUnicode_WSTR_LENGTH(unicode) = length;
981 }
982 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200983 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200984 _PyUnicode_UTF8_LENGTH(unicode) = length;
985 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200986 _PyUnicode_LENGTH(unicode) = length;
987 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +0200988#ifdef Py_DEBUG
989 unicode_fill_invalid(unicode, old_length);
990#endif
Victor Stinner95663112011-10-04 01:03:50 +0200991 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200992 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200993 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200994 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200995 }
Victor Stinner95663112011-10-04 01:03:50 +0200996 assert(_PyUnicode_WSTR(unicode) != NULL);
997
998 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -0700999 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +02001000 PyErr_NoMemory();
1001 return -1;
1002 }
Victor Stinner7a9105a2011-12-12 00:13:42 +01001003 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +02001004 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +01001005 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +02001006 if (!wstr) {
1007 PyErr_NoMemory();
1008 return -1;
1009 }
1010 _PyUnicode_WSTR(unicode) = wstr;
1011 _PyUnicode_WSTR(unicode)[length] = 0;
1012 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001013 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001014 return 0;
1015}
1016
Victor Stinnerfe226c02011-10-03 03:52:20 +02001017static PyObject*
1018resize_copy(PyObject *unicode, Py_ssize_t length)
1019{
1020 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001021 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001022 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001023
Benjamin Petersonbac79492012-01-14 13:34:47 -05001024 if (PyUnicode_READY(unicode) == -1)
Victor Stinner7a9105a2011-12-12 00:13:42 +01001025 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001026
1027 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1028 if (copy == NULL)
1029 return NULL;
1030
1031 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +02001032 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001033 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +02001034 }
1035 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001036 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001037
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001038 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001039 if (w == NULL)
1040 return NULL;
1041 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1042 copy_length = Py_MIN(copy_length, length);
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +02001043 Py_MEMCPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
1044 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001045 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001046 }
1047}
1048
Guido van Rossumd57fd912000-03-10 22:53:23 +00001049/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +00001050 Ux0000 terminated; some code (e.g. new_identifier)
1051 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001052
1053 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +00001054 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001055
1056*/
1057
Alexander Belopolsky40018472011-02-26 01:02:56 +00001058static PyUnicodeObject *
1059_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001060{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001061 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001062 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001063
Thomas Wouters477c8d52006-05-27 19:21:47 +00001064 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001065 if (length == 0 && unicode_empty != NULL) {
1066 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001067 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001068 }
1069
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001070 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001071 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001072 return (PyUnicodeObject *)PyErr_NoMemory();
1073 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001074 if (length < 0) {
1075 PyErr_SetString(PyExc_SystemError,
1076 "Negative size passed to _PyUnicode_New");
1077 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001078 }
1079
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001080 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1081 if (unicode == NULL)
1082 return NULL;
1083 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +01001084
1085 _PyUnicode_WSTR_LENGTH(unicode) = length;
1086 _PyUnicode_HASH(unicode) = -1;
1087 _PyUnicode_STATE(unicode).interned = 0;
1088 _PyUnicode_STATE(unicode).kind = 0;
1089 _PyUnicode_STATE(unicode).compact = 0;
1090 _PyUnicode_STATE(unicode).ready = 0;
1091 _PyUnicode_STATE(unicode).ascii = 0;
1092 _PyUnicode_DATA_ANY(unicode) = NULL;
1093 _PyUnicode_LENGTH(unicode) = 0;
1094 _PyUnicode_UTF8(unicode) = NULL;
1095 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1096
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001097 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1098 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001099 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00001100 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001101 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00001102 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001103
Jeremy Hyltond8082792003-09-16 19:41:39 +00001104 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +00001105 * the caller fails before initializing str -- unicode_resize()
1106 * reads str[0], and the Keep-Alive optimization can keep memory
1107 * allocated for str alive across a call to unicode_dealloc(unicode).
1108 * We don't want unicode_resize to read uninitialized memory in
1109 * that case.
1110 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001111 _PyUnicode_WSTR(unicode)[0] = 0;
1112 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +01001113
Victor Stinner7931d9a2011-11-04 00:22:48 +01001114 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001115 return unicode;
1116}
1117
Victor Stinnerf42dc442011-10-02 23:33:16 +02001118static const char*
1119unicode_kind_name(PyObject *unicode)
1120{
Victor Stinner42dfd712011-10-03 14:41:45 +02001121 /* don't check consistency: unicode_kind_name() is called from
1122 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +02001123 if (!PyUnicode_IS_COMPACT(unicode))
1124 {
1125 if (!PyUnicode_IS_READY(unicode))
1126 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -06001127 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001128 {
1129 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001130 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001131 return "legacy ascii";
1132 else
1133 return "legacy latin1";
1134 case PyUnicode_2BYTE_KIND:
1135 return "legacy UCS2";
1136 case PyUnicode_4BYTE_KIND:
1137 return "legacy UCS4";
1138 default:
1139 return "<legacy invalid kind>";
1140 }
1141 }
1142 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -06001143 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001144 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001145 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001146 return "ascii";
1147 else
Victor Stinnera3b334d2011-10-03 13:53:37 +02001148 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001149 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001150 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001151 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001152 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001153 default:
1154 return "<invalid compact kind>";
1155 }
1156}
1157
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001158#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001159/* Functions wrapping macros for use in debugger */
/* Function form of the PyUnicode_UTF8() macro. */
char *_PyUnicode_utf8(void *unicode){
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
1163
/* Function form of the _PyUnicode_COMPACT_DATA() macro. */
void *_PyUnicode_compact_data(void *unicode) {
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}
1167void *_PyUnicode_data(void *unicode){
1168 printf("obj %p\n", unicode);
1169 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1170 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1171 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1172 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1173 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1174 return PyUnicode_DATA(unicode);
1175}
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001176
1177void
1178_PyUnicode_Dump(PyObject *op)
1179{
1180 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001181 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1182 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1183 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001184
Victor Stinnera849a4b2011-10-03 12:12:11 +02001185 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001186 {
1187 if (ascii->state.ascii)
1188 data = (ascii + 1);
1189 else
1190 data = (compact + 1);
1191 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001192 else
1193 data = unicode->data.any;
Victor Stinner293f3f52014-07-01 08:57:10 +02001194 printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1195 unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001196
Victor Stinnera849a4b2011-10-03 12:12:11 +02001197 if (ascii->wstr == data)
1198 printf("shared ");
1199 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001200
Victor Stinnera3b334d2011-10-03 13:53:37 +02001201 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinner293f3f52014-07-01 08:57:10 +02001202 printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
Victor Stinnera849a4b2011-10-03 12:12:11 +02001203 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1204 printf("shared ");
Victor Stinner293f3f52014-07-01 08:57:10 +02001205 printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
1206 compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001207 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001208 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001209}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001210#endif
1211
1212PyObject *
1213PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1214{
1215 PyObject *obj;
1216 PyCompactUnicodeObject *unicode;
1217 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001218 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001219 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001220 Py_ssize_t char_size;
1221 Py_ssize_t struct_size;
1222
1223 /* Optimization for empty strings */
1224 if (size == 0 && unicode_empty != NULL) {
1225 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001226 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001227 }
1228
Victor Stinner9e9d6892011-10-04 01:02:02 +02001229 is_ascii = 0;
1230 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001231 struct_size = sizeof(PyCompactUnicodeObject);
1232 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001233 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001234 char_size = 1;
1235 is_ascii = 1;
1236 struct_size = sizeof(PyASCIIObject);
1237 }
1238 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001239 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001240 char_size = 1;
1241 }
1242 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001243 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001244 char_size = 2;
1245 if (sizeof(wchar_t) == 2)
1246 is_sharing = 1;
1247 }
1248 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001249 if (maxchar > MAX_UNICODE) {
1250 PyErr_SetString(PyExc_SystemError,
1251 "invalid maximum character passed to PyUnicode_New");
1252 return NULL;
1253 }
Victor Stinner8f825062012-04-27 13:55:39 +02001254 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001255 char_size = 4;
1256 if (sizeof(wchar_t) == 4)
1257 is_sharing = 1;
1258 }
1259
1260 /* Ensure we won't overflow the size. */
1261 if (size < 0) {
1262 PyErr_SetString(PyExc_SystemError,
1263 "Negative size passed to PyUnicode_New");
1264 return NULL;
1265 }
1266 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1267 return PyErr_NoMemory();
1268
1269 /* Duplicated allocation code from _PyObject_New() instead of a call to
1270 * PyObject_New() so we are able to allocate space for the object and
1271 * it's data buffer.
1272 */
1273 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1274 if (obj == NULL)
1275 return PyErr_NoMemory();
1276 obj = PyObject_INIT(obj, &PyUnicode_Type);
1277 if (obj == NULL)
1278 return NULL;
1279
1280 unicode = (PyCompactUnicodeObject *)obj;
1281 if (is_ascii)
1282 data = ((PyASCIIObject*)obj) + 1;
1283 else
1284 data = unicode + 1;
1285 _PyUnicode_LENGTH(unicode) = size;
1286 _PyUnicode_HASH(unicode) = -1;
1287 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001288 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001289 _PyUnicode_STATE(unicode).compact = 1;
1290 _PyUnicode_STATE(unicode).ready = 1;
1291 _PyUnicode_STATE(unicode).ascii = is_ascii;
1292 if (is_ascii) {
1293 ((char*)data)[size] = 0;
1294 _PyUnicode_WSTR(unicode) = NULL;
1295 }
Victor Stinner8f825062012-04-27 13:55:39 +02001296 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001297 ((char*)data)[size] = 0;
1298 _PyUnicode_WSTR(unicode) = NULL;
1299 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001300 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001301 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001302 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001303 else {
1304 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001305 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001306 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001307 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001308 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001309 ((Py_UCS4*)data)[size] = 0;
1310 if (is_sharing) {
1311 _PyUnicode_WSTR_LENGTH(unicode) = size;
1312 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1313 }
1314 else {
1315 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1316 _PyUnicode_WSTR(unicode) = NULL;
1317 }
1318 }
Victor Stinner8f825062012-04-27 13:55:39 +02001319#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001320 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001321#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001322 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001323 return obj;
1324}
1325
#if SIZEOF_WCHAR_T == 2
/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
   will decode surrogate pairs, the other conversions are implemented as macros
   for efficiency.

   The wchar_t units in [begin, end) are written to the 4-byte data of
   unicode; a high surrogate immediately followed by a low surrogate becomes
   a single code point, every other unit is copied unchanged.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        /* never write past the declared length of the destination */
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (!(Py_UNICODE_IS_HIGH_SURROGATE(src[0])
              && (src+1) < end
              && Py_UNICODE_IS_LOW_SURROGATE(src[1])))
        {
            /* ordinary unit, or a surrogate without its partner */
            *dst++ = *src;
            src++;
            continue;
        }
        *dst++ = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
        src += 2;
    }
    /* the destination must have been filled exactly */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));

}
#endif
1365
Victor Stinnercd9950f2011-10-02 00:34:53 +02001366static int
Victor Stinner488fa492011-12-12 00:01:39 +01001367unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001368{
Victor Stinner488fa492011-12-12 00:01:39 +01001369 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001370 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001371 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001372 return -1;
1373 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001374 return 0;
1375}
1376
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001377static int
1378_copy_characters(PyObject *to, Py_ssize_t to_start,
1379 PyObject *from, Py_ssize_t from_start,
1380 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001381{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001382 unsigned int from_kind, to_kind;
1383 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001384
Victor Stinneree4544c2012-05-09 22:24:08 +02001385 assert(0 <= how_many);
1386 assert(0 <= from_start);
1387 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001388 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001389 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001390 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001391
Victor Stinnerd3f08822012-05-29 12:57:52 +02001392 assert(PyUnicode_Check(to));
1393 assert(PyUnicode_IS_READY(to));
1394 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1395
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001396 if (how_many == 0)
1397 return 0;
1398
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001399 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001400 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001401 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001402 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001403
Victor Stinnerf1852262012-06-16 16:38:26 +02001404#ifdef Py_DEBUG
1405 if (!check_maxchar
1406 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1407 {
1408 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1409 Py_UCS4 ch;
1410 Py_ssize_t i;
1411 for (i=0; i < how_many; i++) {
1412 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1413 assert(ch <= to_maxchar);
1414 }
1415 }
1416#endif
1417
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001418 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001419 if (check_maxchar
1420 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1421 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001422 /* Writing Latin-1 characters into an ASCII string requires to
1423 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001424 Py_UCS4 max_char;
1425 max_char = ucs1lib_find_max_char(from_data,
1426 (Py_UCS1*)from_data + how_many);
1427 if (max_char >= 128)
1428 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001429 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001430 Py_MEMCPY((char*)to_data + to_kind * to_start,
1431 (char*)from_data + from_kind * from_start,
1432 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001433 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001434 else if (from_kind == PyUnicode_1BYTE_KIND
1435 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001436 {
1437 _PyUnicode_CONVERT_BYTES(
1438 Py_UCS1, Py_UCS2,
1439 PyUnicode_1BYTE_DATA(from) + from_start,
1440 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1441 PyUnicode_2BYTE_DATA(to) + to_start
1442 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001443 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001444 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001445 && to_kind == PyUnicode_4BYTE_KIND)
1446 {
1447 _PyUnicode_CONVERT_BYTES(
1448 Py_UCS1, Py_UCS4,
1449 PyUnicode_1BYTE_DATA(from) + from_start,
1450 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1451 PyUnicode_4BYTE_DATA(to) + to_start
1452 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001453 }
1454 else if (from_kind == PyUnicode_2BYTE_KIND
1455 && to_kind == PyUnicode_4BYTE_KIND)
1456 {
1457 _PyUnicode_CONVERT_BYTES(
1458 Py_UCS2, Py_UCS4,
1459 PyUnicode_2BYTE_DATA(from) + from_start,
1460 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1461 PyUnicode_4BYTE_DATA(to) + to_start
1462 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001463 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001464 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001465 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1466
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001467 if (!check_maxchar) {
1468 if (from_kind == PyUnicode_2BYTE_KIND
1469 && to_kind == PyUnicode_1BYTE_KIND)
1470 {
1471 _PyUnicode_CONVERT_BYTES(
1472 Py_UCS2, Py_UCS1,
1473 PyUnicode_2BYTE_DATA(from) + from_start,
1474 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1475 PyUnicode_1BYTE_DATA(to) + to_start
1476 );
1477 }
1478 else if (from_kind == PyUnicode_4BYTE_KIND
1479 && to_kind == PyUnicode_1BYTE_KIND)
1480 {
1481 _PyUnicode_CONVERT_BYTES(
1482 Py_UCS4, Py_UCS1,
1483 PyUnicode_4BYTE_DATA(from) + from_start,
1484 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1485 PyUnicode_1BYTE_DATA(to) + to_start
1486 );
1487 }
1488 else if (from_kind == PyUnicode_4BYTE_KIND
1489 && to_kind == PyUnicode_2BYTE_KIND)
1490 {
1491 _PyUnicode_CONVERT_BYTES(
1492 Py_UCS4, Py_UCS2,
1493 PyUnicode_4BYTE_DATA(from) + from_start,
1494 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1495 PyUnicode_2BYTE_DATA(to) + to_start
1496 );
1497 }
1498 else {
1499 assert(0);
1500 return -1;
1501 }
1502 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001503 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001504 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001505 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001506 Py_ssize_t i;
1507
Victor Stinnera0702ab2011-09-29 14:14:38 +02001508 for (i=0; i < how_many; i++) {
1509 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001510 if (ch > to_maxchar)
1511 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001512 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1513 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001514 }
1515 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001516 return 0;
1517}
1518
Victor Stinnerd3f08822012-05-29 12:57:52 +02001519void
1520_PyUnicode_FastCopyCharacters(
1521 PyObject *to, Py_ssize_t to_start,
1522 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001523{
1524 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1525}
1526
1527Py_ssize_t
1528PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1529 PyObject *from, Py_ssize_t from_start,
1530 Py_ssize_t how_many)
1531{
1532 int err;
1533
1534 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1535 PyErr_BadInternalCall();
1536 return -1;
1537 }
1538
Benjamin Petersonbac79492012-01-14 13:34:47 -05001539 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001540 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001541 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001542 return -1;
1543
Victor Stinnerd3f08822012-05-29 12:57:52 +02001544 if (from_start < 0) {
1545 PyErr_SetString(PyExc_IndexError, "string index out of range");
1546 return -1;
1547 }
1548 if (to_start < 0) {
1549 PyErr_SetString(PyExc_IndexError, "string index out of range");
1550 return -1;
1551 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001552 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1553 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1554 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001555 "Cannot write %zi characters at %zi "
1556 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001557 how_many, to_start, PyUnicode_GET_LENGTH(to));
1558 return -1;
1559 }
1560
1561 if (how_many == 0)
1562 return 0;
1563
Victor Stinner488fa492011-12-12 00:01:39 +01001564 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001565 return -1;
1566
1567 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1568 if (err) {
1569 PyErr_Format(PyExc_SystemError,
1570 "Cannot copy %s characters "
1571 "into a string of %s characters",
1572 unicode_kind_name(from),
1573 unicode_kind_name(to));
1574 return -1;
1575 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001576 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001577}
1578
Victor Stinner17222162011-09-28 22:15:37 +02001579/* Find the maximum code point and count the number of surrogate pairs so a
1580 correct string length can be computed before converting a string to UCS4.
1581 This function counts single surrogates as a character and not as a pair.
1582
1583 Return 0 on success, or -1 on error. */
1584static int
1585find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1586 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001587{
1588 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001589 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001590
Victor Stinnerc53be962011-10-02 21:33:54 +02001591 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001592 *num_surrogates = 0;
1593 *maxchar = 0;
1594
1595 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001596#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001597 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1598 && (iter+1) < end
1599 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1600 {
1601 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1602 ++(*num_surrogates);
1603 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001604 }
1605 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001606#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001607 {
1608 ch = *iter;
1609 iter++;
1610 }
1611 if (ch > *maxchar) {
1612 *maxchar = ch;
1613 if (*maxchar > MAX_UNICODE) {
1614 PyErr_Format(PyExc_ValueError,
1615 "character U+%x is not in range [U+0000; U+10ffff]",
1616 ch);
1617 return -1;
1618 }
1619 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001620 }
1621 return 0;
1622}
1623
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001624int
1625_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001626{
1627 wchar_t *end;
1628 Py_UCS4 maxchar = 0;
1629 Py_ssize_t num_surrogates;
1630#if SIZEOF_WCHAR_T == 2
1631 Py_ssize_t length_wo_surrogates;
1632#endif
1633
Georg Brandl7597add2011-10-05 16:36:47 +02001634 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001635 strings were created using _PyObject_New() and where no canonical
1636 representation (the str field) has been set yet aka strings
1637 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001638 assert(_PyUnicode_CHECK(unicode));
1639 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001640 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001641 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001642 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001643 /* Actually, it should neither be interned nor be anything else: */
1644 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001645
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001646 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001647 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001648 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001649 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001650
1651 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001652 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1653 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001654 PyErr_NoMemory();
1655 return -1;
1656 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001657 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001658 _PyUnicode_WSTR(unicode), end,
1659 PyUnicode_1BYTE_DATA(unicode));
1660 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1661 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1662 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1663 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001664 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001665 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001666 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001667 }
1668 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001669 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001670 _PyUnicode_UTF8(unicode) = NULL;
1671 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001672 }
1673 PyObject_FREE(_PyUnicode_WSTR(unicode));
1674 _PyUnicode_WSTR(unicode) = NULL;
1675 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1676 }
1677 /* In this case we might have to convert down from 4-byte native
1678 wchar_t to 2-byte unicode. */
1679 else if (maxchar < 65536) {
1680 assert(num_surrogates == 0 &&
1681 "FindMaxCharAndNumSurrogatePairs() messed up");
1682
Victor Stinner506f5922011-09-28 22:34:18 +02001683#if SIZEOF_WCHAR_T == 2
1684 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001685 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001686 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1687 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1688 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001689 _PyUnicode_UTF8(unicode) = NULL;
1690 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001691#else
1692 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001693 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001694 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001695 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001696 PyErr_NoMemory();
1697 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001698 }
Victor Stinner506f5922011-09-28 22:34:18 +02001699 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1700 _PyUnicode_WSTR(unicode), end,
1701 PyUnicode_2BYTE_DATA(unicode));
1702 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1703 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1704 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001705 _PyUnicode_UTF8(unicode) = NULL;
1706 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001707 PyObject_FREE(_PyUnicode_WSTR(unicode));
1708 _PyUnicode_WSTR(unicode) = NULL;
1709 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1710#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001711 }
1712 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1713 else {
1714#if SIZEOF_WCHAR_T == 2
1715 /* in case the native representation is 2-bytes, we need to allocate a
1716 new normalized 4-byte version. */
1717 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001718 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1719 PyErr_NoMemory();
1720 return -1;
1721 }
Victor Stinnerc3c74152011-10-02 20:39:55 +02001722 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1723 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001724 PyErr_NoMemory();
1725 return -1;
1726 }
1727 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1728 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001729 _PyUnicode_UTF8(unicode) = NULL;
1730 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001731 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1732 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001733 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001734 PyObject_FREE(_PyUnicode_WSTR(unicode));
1735 _PyUnicode_WSTR(unicode) = NULL;
1736 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1737#else
1738 assert(num_surrogates == 0);
1739
Victor Stinnerc3c74152011-10-02 20:39:55 +02001740 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001741 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001742 _PyUnicode_UTF8(unicode) = NULL;
1743 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001744 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1745#endif
1746 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1747 }
1748 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001749 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001750 return 0;
1751}
1752
Alexander Belopolsky40018472011-02-26 01:02:56 +00001753static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001754unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001755{
Walter Dörwald16807132007-05-25 13:52:07 +00001756 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001757 case SSTATE_NOT_INTERNED:
1758 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001759
Benjamin Peterson29060642009-01-31 22:14:21 +00001760 case SSTATE_INTERNED_MORTAL:
1761 /* revive dead object temporarily for DelItem */
1762 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001763 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001764 Py_FatalError(
1765 "deletion of interned string failed");
1766 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001767
Benjamin Peterson29060642009-01-31 22:14:21 +00001768 case SSTATE_INTERNED_IMMORTAL:
1769 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001770
Benjamin Peterson29060642009-01-31 22:14:21 +00001771 default:
1772 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001773 }
1774
Victor Stinner03490912011-10-03 23:45:12 +02001775 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001776 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001777 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001778 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001779 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1780 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001781
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001782 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001783}
1784
#ifdef Py_DEBUG
/* Debug helper: return 1 if unicode is the shared empty string or one of the
   cached single-character Latin-1 strings, 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;

    /* Only a one-character string with a canonical representation can be
       one of the cached Latin-1 characters. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    if (ch < 256 && unicode_latin1[ch] == unicode)
        return 1;
    return 0;
}
#endif
1801
Alexander Belopolsky40018472011-02-26 01:02:56 +00001802static int
Victor Stinner488fa492011-12-12 00:01:39 +01001803unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001804{
Victor Stinner488fa492011-12-12 00:01:39 +01001805 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001806 if (Py_REFCNT(unicode) != 1)
1807 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001808 if (_PyUnicode_HASH(unicode) != -1)
1809 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001810 if (PyUnicode_CHECK_INTERNED(unicode))
1811 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001812 if (!PyUnicode_CheckExact(unicode))
1813 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001814#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001815 /* singleton refcount is greater than 1 */
1816 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001817#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001818 return 1;
1819}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001820
Victor Stinnerfe226c02011-10-03 03:52:20 +02001821static int
1822unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1823{
1824 PyObject *unicode;
1825 Py_ssize_t old_length;
1826
1827 assert(p_unicode != NULL);
1828 unicode = *p_unicode;
1829
1830 assert(unicode != NULL);
1831 assert(PyUnicode_Check(unicode));
1832 assert(0 <= length);
1833
Victor Stinner910337b2011-10-03 03:20:16 +02001834 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001835 old_length = PyUnicode_WSTR_LENGTH(unicode);
1836 else
1837 old_length = PyUnicode_GET_LENGTH(unicode);
1838 if (old_length == length)
1839 return 0;
1840
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001841 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001842 _Py_INCREF_UNICODE_EMPTY();
1843 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001844 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001845 Py_SETREF(*p_unicode, unicode_empty);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001846 return 0;
1847 }
1848
Victor Stinner488fa492011-12-12 00:01:39 +01001849 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001850 PyObject *copy = resize_copy(unicode, length);
1851 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001852 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001853 Py_SETREF(*p_unicode, copy);
Benjamin Peterson29060642009-01-31 22:14:21 +00001854 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001855 }
1856
Victor Stinnerfe226c02011-10-03 03:52:20 +02001857 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001858 PyObject *new_unicode = resize_compact(unicode, length);
1859 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001860 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001861 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001862 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001863 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001864 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001865}
1866
Alexander Belopolsky40018472011-02-26 01:02:56 +00001867int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001868PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001869{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001870 PyObject *unicode;
1871 if (p_unicode == NULL) {
1872 PyErr_BadInternalCall();
1873 return -1;
1874 }
1875 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001876 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001877 {
1878 PyErr_BadInternalCall();
1879 return -1;
1880 }
1881 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001882}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001883
Serhiy Storchakad65c9492015-11-02 14:10:23 +02001884/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001885
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001886 WARNING: The function doesn't copy the terminating null character and
1887 doesn't check the maximum character (may write a latin1 character in an
1888 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001889static void
1890unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1891 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001892{
1893 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1894 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001895 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001896
1897 switch (kind) {
1898 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001899 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02001900#ifdef Py_DEBUG
1901 if (PyUnicode_IS_ASCII(unicode)) {
1902 Py_UCS4 maxchar = ucs1lib_find_max_char(
1903 (const Py_UCS1*)str,
1904 (const Py_UCS1*)str + len);
1905 assert(maxchar < 128);
1906 }
1907#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001908 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001909 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001910 }
1911 case PyUnicode_2BYTE_KIND: {
1912 Py_UCS2 *start = (Py_UCS2 *)data + index;
1913 Py_UCS2 *ucs2 = start;
1914 assert(index <= PyUnicode_GET_LENGTH(unicode));
1915
Victor Stinner184252a2012-06-16 02:57:41 +02001916 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001917 *ucs2 = (Py_UCS2)*str;
1918
1919 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001920 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001921 }
1922 default: {
1923 Py_UCS4 *start = (Py_UCS4 *)data + index;
1924 Py_UCS4 *ucs4 = start;
1925 assert(kind == PyUnicode_4BYTE_KIND);
1926 assert(index <= PyUnicode_GET_LENGTH(unicode));
1927
Victor Stinner184252a2012-06-16 02:57:41 +02001928 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001929 *ucs4 = (Py_UCS4)*str;
1930
1931 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001932 }
1933 }
1934}
1935
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001936static PyObject*
1937get_latin1_char(unsigned char ch)
1938{
Victor Stinnera464fc12011-10-02 20:39:30 +02001939 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001940 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001941 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001942 if (!unicode)
1943 return NULL;
1944 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001945 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001946 unicode_latin1[ch] = unicode;
1947 }
1948 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001949 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001950}
1951
Victor Stinner985a82a2014-01-03 12:53:47 +01001952static PyObject*
1953unicode_char(Py_UCS4 ch)
1954{
1955 PyObject *unicode;
1956
1957 assert(ch <= MAX_UNICODE);
1958
Victor Stinnerf3b46b42014-01-03 13:16:00 +01001959 if (ch < 256)
1960 return get_latin1_char(ch);
1961
Victor Stinner985a82a2014-01-03 12:53:47 +01001962 unicode = PyUnicode_New(1, ch);
1963 if (unicode == NULL)
1964 return NULL;
1965 switch (PyUnicode_KIND(unicode)) {
1966 case PyUnicode_1BYTE_KIND:
1967 PyUnicode_1BYTE_DATA(unicode)[0] = (Py_UCS1)ch;
1968 break;
1969 case PyUnicode_2BYTE_KIND:
1970 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
1971 break;
1972 default:
1973 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1974 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
1975 }
1976 assert(_PyUnicode_CheckConsistency(unicode, 1));
1977 return unicode;
1978}
1979
Alexander Belopolsky40018472011-02-26 01:02:56 +00001980PyObject *
1981PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001982{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001983 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001984 Py_UCS4 maxchar = 0;
1985 Py_ssize_t num_surrogates;
1986
1987 if (u == NULL)
1988 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001989
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001990 /* If the Unicode data is known at construction time, we can apply
1991 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001992
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001993 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02001994 if (size == 0)
1995 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00001996
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001997 /* Single character Unicode objects in the Latin-1 range are
1998 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001999 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002000 return get_latin1_char((unsigned char)*u);
2001
2002 /* If not empty and not single character, copy the Unicode data
2003 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02002004 if (find_maxchar_surrogates(u, u + size,
2005 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002006 return NULL;
2007
Victor Stinner8faf8212011-12-08 22:14:11 +01002008 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002009 if (!unicode)
2010 return NULL;
2011
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002012 switch (PyUnicode_KIND(unicode)) {
2013 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002014 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002015 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2016 break;
2017 case PyUnicode_2BYTE_KIND:
2018#if Py_UNICODE_SIZE == 2
2019 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
2020#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002021 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002022 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2023#endif
2024 break;
2025 case PyUnicode_4BYTE_KIND:
2026#if SIZEOF_WCHAR_T == 2
2027 /* This is the only case which has to process surrogates, thus
2028 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02002029 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002030#else
2031 assert(num_surrogates == 0);
2032 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
2033#endif
2034 break;
2035 default:
2036 assert(0 && "Impossible state");
2037 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002038
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002039 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002040}
2041
Alexander Belopolsky40018472011-02-26 01:02:56 +00002042PyObject *
2043PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002044{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002045 if (size < 0) {
2046 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00002047 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00002048 return NULL;
2049 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002050 if (u != NULL)
2051 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2052 else
2053 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002054}
2055
Alexander Belopolsky40018472011-02-26 01:02:56 +00002056PyObject *
2057PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00002058{
2059 size_t size = strlen(u);
2060 if (size > PY_SSIZE_T_MAX) {
2061 PyErr_SetString(PyExc_OverflowError, "input too long");
2062 return NULL;
2063 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002064 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002065}
2066
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002067PyObject *
2068_PyUnicode_FromId(_Py_Identifier *id)
2069{
2070 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01002071 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
2072 strlen(id->string),
2073 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002074 if (!id->object)
2075 return NULL;
2076 PyUnicode_InternInPlace(&id->object);
2077 assert(!id->next);
2078 id->next = static_strings;
2079 static_strings = id;
2080 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002081 return id->object;
2082}
2083
2084void
2085_PyUnicode_ClearStaticStrings()
2086{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002087 _Py_Identifier *tmp, *s = static_strings;
2088 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02002089 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002090 tmp = s->next;
2091 s->next = NULL;
2092 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002093 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002094 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002095}
2096
Benjamin Peterson0df54292012-03-26 14:50:32 -04002097/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002098
Victor Stinnerd3f08822012-05-29 12:57:52 +02002099PyObject*
2100_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02002101{
Victor Stinnerd3f08822012-05-29 12:57:52 +02002102 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01002103 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01002104 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02002105#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002106 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02002107#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002108 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01002109 }
Victor Stinner785938e2011-12-11 20:09:03 +01002110 unicode = PyUnicode_New(size, 127);
2111 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02002112 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01002113 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2114 assert(_PyUnicode_CheckConsistency(unicode, 1));
2115 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02002116}
2117
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002118static Py_UCS4
2119kind_maxchar_limit(unsigned int kind)
2120{
Benjamin Petersonead6b532011-12-20 17:23:42 -06002121 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002122 case PyUnicode_1BYTE_KIND:
2123 return 0x80;
2124 case PyUnicode_2BYTE_KIND:
2125 return 0x100;
2126 case PyUnicode_4BYTE_KIND:
2127 return 0x10000;
2128 default:
2129 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01002130 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002131 }
2132}
2133
Victor Stinnere6abb482012-05-02 01:15:40 +02002134Py_LOCAL_INLINE(Py_UCS4)
2135align_maxchar(Py_UCS4 maxchar)
2136{
2137 if (maxchar <= 127)
2138 return 127;
2139 else if (maxchar <= 255)
2140 return 255;
2141 else if (maxchar <= 65535)
2142 return 65535;
2143 else
2144 return MAX_UNICODE;
2145}
2146
Victor Stinner702c7342011-10-05 13:50:52 +02002147static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002148_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00002149{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002150 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002151 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002152
Serhiy Storchaka678db842013-01-26 12:16:36 +02002153 if (size == 0)
2154 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002155 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002156 if (size == 1)
2157 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002158
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002159 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002160 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002161 if (!res)
2162 return NULL;
2163 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002164 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002165 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00002166}
2167
Victor Stinnere57b1c02011-09-28 22:20:48 +02002168static PyObject*
2169_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002170{
2171 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002172 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002173
Serhiy Storchaka678db842013-01-26 12:16:36 +02002174 if (size == 0)
2175 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002176 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002177 if (size == 1)
2178 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002179
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002180 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002181 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002182 if (!res)
2183 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002184 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002185 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002186 else {
2187 _PyUnicode_CONVERT_BYTES(
2188 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2189 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002190 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002191 return res;
2192}
2193
Victor Stinnere57b1c02011-09-28 22:20:48 +02002194static PyObject*
2195_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002196{
2197 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002198 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002199
Serhiy Storchaka678db842013-01-26 12:16:36 +02002200 if (size == 0)
2201 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002202 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002203 if (size == 1)
2204 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002205
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002206 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002207 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002208 if (!res)
2209 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002210 if (max_char < 256)
2211 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2212 PyUnicode_1BYTE_DATA(res));
2213 else if (max_char < 0x10000)
2214 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2215 PyUnicode_2BYTE_DATA(res));
2216 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002217 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002218 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002219 return res;
2220}
2221
2222PyObject*
2223PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2224{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002225 if (size < 0) {
2226 PyErr_SetString(PyExc_ValueError, "size must be positive");
2227 return NULL;
2228 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002229 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002230 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002231 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002232 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002233 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002234 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002235 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002236 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002237 PyErr_SetString(PyExc_SystemError, "invalid kind");
2238 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002239 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002240}
2241
Victor Stinnerece58de2012-04-23 23:36:38 +02002242Py_UCS4
2243_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2244{
2245 enum PyUnicode_Kind kind;
2246 void *startptr, *endptr;
2247
2248 assert(PyUnicode_IS_READY(unicode));
2249 assert(0 <= start);
2250 assert(end <= PyUnicode_GET_LENGTH(unicode));
2251 assert(start <= end);
2252
2253 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2254 return PyUnicode_MAX_CHAR_VALUE(unicode);
2255
2256 if (start == end)
2257 return 127;
2258
Victor Stinner94d558b2012-04-27 22:26:58 +02002259 if (PyUnicode_IS_ASCII(unicode))
2260 return 127;
2261
Victor Stinnerece58de2012-04-23 23:36:38 +02002262 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002263 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002264 endptr = (char *)startptr + end * kind;
2265 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002266 switch(kind) {
2267 case PyUnicode_1BYTE_KIND:
2268 return ucs1lib_find_max_char(startptr, endptr);
2269 case PyUnicode_2BYTE_KIND:
2270 return ucs2lib_find_max_char(startptr, endptr);
2271 case PyUnicode_4BYTE_KIND:
2272 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002273 default:
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002274 assert(0);
2275 return 0;
Victor Stinnerece58de2012-04-23 23:36:38 +02002276 }
2277}
2278
Victor Stinner25a4b292011-10-06 12:31:55 +02002279/* Ensure that a string uses the most efficient storage, if it is not the
2280 case: create a new string with of the right kind. Write NULL into *p_unicode
2281 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002282static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002283unicode_adjust_maxchar(PyObject **p_unicode)
2284{
2285 PyObject *unicode, *copy;
2286 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002287 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002288 unsigned int kind;
2289
2290 assert(p_unicode != NULL);
2291 unicode = *p_unicode;
2292 assert(PyUnicode_IS_READY(unicode));
2293 if (PyUnicode_IS_ASCII(unicode))
2294 return;
2295
2296 len = PyUnicode_GET_LENGTH(unicode);
2297 kind = PyUnicode_KIND(unicode);
2298 if (kind == PyUnicode_1BYTE_KIND) {
2299 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002300 max_char = ucs1lib_find_max_char(u, u + len);
2301 if (max_char >= 128)
2302 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002303 }
2304 else if (kind == PyUnicode_2BYTE_KIND) {
2305 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002306 max_char = ucs2lib_find_max_char(u, u + len);
2307 if (max_char >= 256)
2308 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002309 }
2310 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002311 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002312 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002313 max_char = ucs4lib_find_max_char(u, u + len);
2314 if (max_char >= 0x10000)
2315 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002316 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002317 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002318 if (copy != NULL)
2319 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002320 Py_DECREF(unicode);
2321 *p_unicode = copy;
2322}
2323
Victor Stinner034f6cf2011-09-30 02:26:44 +02002324PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002325_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002326{
Victor Stinner87af4f22011-11-21 23:03:47 +01002327 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002328 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002329
Victor Stinner034f6cf2011-09-30 02:26:44 +02002330 if (!PyUnicode_Check(unicode)) {
2331 PyErr_BadInternalCall();
2332 return NULL;
2333 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002334 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002335 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002336
Victor Stinner87af4f22011-11-21 23:03:47 +01002337 length = PyUnicode_GET_LENGTH(unicode);
2338 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002339 if (!copy)
2340 return NULL;
2341 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2342
Victor Stinner87af4f22011-11-21 23:03:47 +01002343 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2344 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002345 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002346 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002347}
2348
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002349
Victor Stinnerbc603d12011-10-02 01:00:40 +02002350/* Widen Unicode objects to larger buffers. Don't write terminating null
2351 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002352
2353void*
2354_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2355{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002356 Py_ssize_t len;
2357 void *result;
2358 unsigned int skind;
2359
Benjamin Petersonbac79492012-01-14 13:34:47 -05002360 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002361 return NULL;
2362
2363 len = PyUnicode_GET_LENGTH(s);
2364 skind = PyUnicode_KIND(s);
2365 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002366 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002367 return NULL;
2368 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002369 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002370 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002371 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002372 if (!result)
2373 return PyErr_NoMemory();
2374 assert(skind == PyUnicode_1BYTE_KIND);
2375 _PyUnicode_CONVERT_BYTES(
2376 Py_UCS1, Py_UCS2,
2377 PyUnicode_1BYTE_DATA(s),
2378 PyUnicode_1BYTE_DATA(s) + len,
2379 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002380 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002381 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002382 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002383 if (!result)
2384 return PyErr_NoMemory();
2385 if (skind == PyUnicode_2BYTE_KIND) {
2386 _PyUnicode_CONVERT_BYTES(
2387 Py_UCS2, Py_UCS4,
2388 PyUnicode_2BYTE_DATA(s),
2389 PyUnicode_2BYTE_DATA(s) + len,
2390 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002391 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002392 else {
2393 assert(skind == PyUnicode_1BYTE_KIND);
2394 _PyUnicode_CONVERT_BYTES(
2395 Py_UCS1, Py_UCS4,
2396 PyUnicode_1BYTE_DATA(s),
2397 PyUnicode_1BYTE_DATA(s) + len,
2398 result);
2399 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002400 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002401 default:
2402 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002403 }
Victor Stinner01698042011-10-04 00:04:26 +02002404 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002405 return NULL;
2406}
2407
2408static Py_UCS4*
2409as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2410 int copy_null)
2411{
2412 int kind;
2413 void *data;
2414 Py_ssize_t len, targetlen;
2415 if (PyUnicode_READY(string) == -1)
2416 return NULL;
2417 kind = PyUnicode_KIND(string);
2418 data = PyUnicode_DATA(string);
2419 len = PyUnicode_GET_LENGTH(string);
2420 targetlen = len;
2421 if (copy_null)
2422 targetlen++;
2423 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002424 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002425 if (!target) {
2426 PyErr_NoMemory();
2427 return NULL;
2428 }
2429 }
2430 else {
2431 if (targetsize < targetlen) {
2432 PyErr_Format(PyExc_SystemError,
2433 "string is longer than the buffer");
2434 if (copy_null && 0 < targetsize)
2435 target[0] = 0;
2436 return NULL;
2437 }
2438 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002439 if (kind == PyUnicode_1BYTE_KIND) {
2440 Py_UCS1 *start = (Py_UCS1 *) data;
2441 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002442 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002443 else if (kind == PyUnicode_2BYTE_KIND) {
2444 Py_UCS2 *start = (Py_UCS2 *) data;
2445 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2446 }
2447 else {
2448 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002449 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002450 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002451 if (copy_null)
2452 target[len] = 0;
2453 return target;
2454}
2455
2456Py_UCS4*
2457PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2458 int copy_null)
2459{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002460 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002461 PyErr_BadInternalCall();
2462 return NULL;
2463 }
2464 return as_ucs4(string, target, targetsize, copy_null);
2465}
2466
2467Py_UCS4*
2468PyUnicode_AsUCS4Copy(PyObject *string)
2469{
2470 return as_ucs4(string, NULL, 0, 1);
2471}
2472
2473#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00002474
Alexander Belopolsky40018472011-02-26 01:02:56 +00002475PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02002476PyUnicode_FromWideChar(const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002477{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002478 if (w == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002479 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02002480 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson29060642009-01-31 22:14:21 +00002481 PyErr_BadInternalCall();
2482 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002483 }
2484
Martin v. Löwis790465f2008-04-05 20:41:37 +00002485 if (size == -1) {
2486 size = wcslen(w);
2487 }
2488
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002489 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002490}
2491
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002492#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002493
/* Size of the char buffers used to format %lld-style integers and %p.
   An integer of SIZEOF_LONG_LONG bytes needs at most
   ceil(log10(256) * SIZEOF_LONG_LONG) decimal digits, plus 1 for the
   sign; 53/22 is an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG * 53 - 1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002498
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002499static int
2500unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2501 Py_ssize_t width, Py_ssize_t precision)
2502{
2503 Py_ssize_t length, fill, arglen;
2504 Py_UCS4 maxchar;
2505
2506 if (PyUnicode_READY(str) == -1)
2507 return -1;
2508
2509 length = PyUnicode_GET_LENGTH(str);
2510 if ((precision == -1 || precision >= length)
2511 && width <= length)
2512 return _PyUnicodeWriter_WriteStr(writer, str);
2513
2514 if (precision != -1)
2515 length = Py_MIN(precision, length);
2516
2517 arglen = Py_MAX(length, width);
2518 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2519 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2520 else
2521 maxchar = writer->maxchar;
2522
2523 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2524 return -1;
2525
2526 if (width > length) {
2527 fill = width - length;
2528 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2529 return -1;
2530 writer->pos += fill;
2531 }
2532
2533 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2534 str, 0, length);
2535 writer->pos += length;
2536 return 0;
2537}
2538
2539static int
2540unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2541 Py_ssize_t width, Py_ssize_t precision)
2542{
2543 /* UTF-8 */
2544 Py_ssize_t length;
2545 PyObject *unicode;
2546 int res;
2547
2548 length = strlen(str);
2549 if (precision != -1)
2550 length = Py_MIN(length, precision);
2551 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2552 if (unicode == NULL)
2553 return -1;
2554
2555 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2556 Py_DECREF(unicode);
2557 return res;
2558}
2559
Victor Stinner96865452011-03-01 23:44:09 +00002560static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002561unicode_fromformat_arg(_PyUnicodeWriter *writer,
2562 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002563{
Victor Stinnere215d962012-10-06 23:03:36 +02002564 const char *p;
2565 Py_ssize_t len;
2566 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002567 Py_ssize_t width;
2568 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002569 int longflag;
2570 int longlongflag;
2571 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002572 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002573
2574 p = f;
2575 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002576 zeropad = 0;
2577 if (*f == '0') {
2578 zeropad = 1;
2579 f++;
2580 }
Victor Stinner96865452011-03-01 23:44:09 +00002581
2582 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002583 width = -1;
2584 if (Py_ISDIGIT((unsigned)*f)) {
2585 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002586 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002587 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002588 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002589 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002590 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002591 return NULL;
2592 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002593 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002594 f++;
2595 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002596 }
2597 precision = -1;
2598 if (*f == '.') {
2599 f++;
2600 if (Py_ISDIGIT((unsigned)*f)) {
2601 precision = (*f - '0');
2602 f++;
2603 while (Py_ISDIGIT((unsigned)*f)) {
2604 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2605 PyErr_SetString(PyExc_ValueError,
2606 "precision too big");
2607 return NULL;
2608 }
2609 precision = (precision * 10) + (*f - '0');
2610 f++;
2611 }
2612 }
Victor Stinner96865452011-03-01 23:44:09 +00002613 if (*f == '%') {
2614 /* "%.3%s" => f points to "3" */
2615 f--;
2616 }
2617 }
2618 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002619 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002620 f--;
2621 }
Victor Stinner96865452011-03-01 23:44:09 +00002622
2623 /* Handle %ld, %lu, %lld and %llu. */
2624 longflag = 0;
2625 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002626 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002627 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002628 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002629 longflag = 1;
2630 ++f;
2631 }
2632#ifdef HAVE_LONG_LONG
2633 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002634 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002635 longlongflag = 1;
2636 f += 2;
2637 }
2638#endif
2639 }
2640 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002641 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002642 size_tflag = 1;
2643 ++f;
2644 }
Victor Stinnere215d962012-10-06 23:03:36 +02002645
2646 if (f[1] == '\0')
2647 writer->overallocate = 0;
2648
2649 switch (*f) {
2650 case 'c':
2651 {
2652 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002653 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002654 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002655 "character argument not in range(0x110000)");
2656 return NULL;
2657 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002658 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002659 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002660 break;
2661 }
2662
2663 case 'i':
2664 case 'd':
2665 case 'u':
2666 case 'x':
2667 {
2668 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002669 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002670 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002671
2672 if (*f == 'u') {
Victor Stinnere215d962012-10-06 23:03:36 +02002673 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002674 len = sprintf(buffer, "%lu",
Victor Stinnere215d962012-10-06 23:03:36 +02002675 va_arg(*vargs, unsigned long));
2676#ifdef HAVE_LONG_LONG
2677 else if (longlongflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002678 len = sprintf(buffer, "%" PY_FORMAT_LONG_LONG "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002679 va_arg(*vargs, unsigned PY_LONG_LONG));
2680#endif
2681 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002682 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002683 va_arg(*vargs, size_t));
2684 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002685 len = sprintf(buffer, "%u",
Victor Stinnere215d962012-10-06 23:03:36 +02002686 va_arg(*vargs, unsigned int));
2687 }
2688 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002689 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002690 }
2691 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002692 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002693 len = sprintf(buffer, "%li",
Victor Stinnere215d962012-10-06 23:03:36 +02002694 va_arg(*vargs, long));
2695#ifdef HAVE_LONG_LONG
2696 else if (longlongflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002697 len = sprintf(buffer, "%" PY_FORMAT_LONG_LONG "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002698 va_arg(*vargs, PY_LONG_LONG));
2699#endif
2700 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002701 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002702 va_arg(*vargs, Py_ssize_t));
2703 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002704 len = sprintf(buffer, "%i",
Victor Stinnere215d962012-10-06 23:03:36 +02002705 va_arg(*vargs, int));
2706 }
2707 assert(len >= 0);
2708
Victor Stinnere215d962012-10-06 23:03:36 +02002709 if (precision < len)
2710 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002711
2712 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002713 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2714 return NULL;
2715
Victor Stinnere215d962012-10-06 23:03:36 +02002716 if (width > precision) {
2717 Py_UCS4 fillchar;
2718 fill = width - precision;
2719 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002720 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2721 return NULL;
2722 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002723 }
Victor Stinner15a11362012-10-06 23:48:20 +02002724 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002725 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002726 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2727 return NULL;
2728 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002729 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002730
Victor Stinner4a587072013-11-19 12:54:53 +01002731 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2732 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002733 break;
2734 }
2735
2736 case 'p':
2737 {
2738 char number[MAX_LONG_LONG_CHARS];
2739
2740 len = sprintf(number, "%p", va_arg(*vargs, void*));
2741 assert(len >= 0);
2742
2743 /* %p is ill-defined: ensure leading 0x. */
2744 if (number[1] == 'X')
2745 number[1] = 'x';
2746 else if (number[1] != 'x') {
2747 memmove(number + 2, number,
2748 strlen(number) + 1);
2749 number[0] = '0';
2750 number[1] = 'x';
2751 len += 2;
2752 }
2753
Victor Stinner4a587072013-11-19 12:54:53 +01002754 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002755 return NULL;
2756 break;
2757 }
2758
2759 case 's':
2760 {
2761 /* UTF-8 */
2762 const char *s = va_arg(*vargs, const char*);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002763 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002764 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002765 break;
2766 }
2767
2768 case 'U':
2769 {
2770 PyObject *obj = va_arg(*vargs, PyObject *);
2771 assert(obj && _PyUnicode_CHECK(obj));
2772
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002773 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002774 return NULL;
2775 break;
2776 }
2777
2778 case 'V':
2779 {
2780 PyObject *obj = va_arg(*vargs, PyObject *);
2781 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002782 if (obj) {
2783 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002784 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002785 return NULL;
2786 }
2787 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002788 assert(str != NULL);
2789 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002790 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002791 }
2792 break;
2793 }
2794
2795 case 'S':
2796 {
2797 PyObject *obj = va_arg(*vargs, PyObject *);
2798 PyObject *str;
2799 assert(obj);
2800 str = PyObject_Str(obj);
2801 if (!str)
2802 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002803 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002804 Py_DECREF(str);
2805 return NULL;
2806 }
2807 Py_DECREF(str);
2808 break;
2809 }
2810
2811 case 'R':
2812 {
2813 PyObject *obj = va_arg(*vargs, PyObject *);
2814 PyObject *repr;
2815 assert(obj);
2816 repr = PyObject_Repr(obj);
2817 if (!repr)
2818 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002819 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002820 Py_DECREF(repr);
2821 return NULL;
2822 }
2823 Py_DECREF(repr);
2824 break;
2825 }
2826
2827 case 'A':
2828 {
2829 PyObject *obj = va_arg(*vargs, PyObject *);
2830 PyObject *ascii;
2831 assert(obj);
2832 ascii = PyObject_ASCII(obj);
2833 if (!ascii)
2834 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002835 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002836 Py_DECREF(ascii);
2837 return NULL;
2838 }
2839 Py_DECREF(ascii);
2840 break;
2841 }
2842
2843 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002844 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002845 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002846 break;
2847
2848 default:
2849 /* if we stumble upon an unknown formatting code, copy the rest
2850 of the format string to the output string. (we cannot just
2851 skip the code, since there's no way to know what's in the
2852 argument list) */
2853 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01002854 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002855 return NULL;
2856 f = p+len;
2857 return f;
2858 }
2859
2860 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002861 return f;
2862}
2863
Walter Dörwaldd2034312007-05-18 16:29:38 +00002864PyObject *
2865PyUnicode_FromFormatV(const char *format, va_list vargs)
2866{
Victor Stinnere215d962012-10-06 23:03:36 +02002867 va_list vargs2;
2868 const char *f;
2869 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002870
Victor Stinner8f674cc2013-04-17 23:02:17 +02002871 _PyUnicodeWriter_Init(&writer);
2872 writer.min_length = strlen(format) + 100;
2873 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02002874
2875 /* va_list may be an array (of 1 item) on some platforms (ex: AMD64).
2876 Copy it to be able to pass a reference to a subfunction. */
2877 Py_VA_COPY(vargs2, vargs);
2878
2879 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002880 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002881 f = unicode_fromformat_arg(&writer, f, &vargs2);
2882 if (f == NULL)
2883 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002884 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002885 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002886 const char *p;
2887 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002888
Victor Stinnere215d962012-10-06 23:03:36 +02002889 p = f;
2890 do
2891 {
2892 if ((unsigned char)*p > 127) {
2893 PyErr_Format(PyExc_ValueError,
2894 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2895 "string, got a non-ASCII byte: 0x%02x",
2896 (unsigned char)*p);
2897 return NULL;
2898 }
2899 p++;
2900 }
2901 while (*p != '\0' && *p != '%');
2902 len = p - f;
2903
2904 if (*p == '\0')
2905 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01002906
2907 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002908 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002909
2910 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002911 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002912 }
Victor Stinnere215d962012-10-06 23:03:36 +02002913 return _PyUnicodeWriter_Finish(&writer);
2914
2915 fail:
2916 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002917 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002918}
2919
Walter Dörwaldd2034312007-05-18 16:29:38 +00002920PyObject *
2921PyUnicode_FromFormat(const char *format, ...)
2922{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002923 PyObject* ret;
2924 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002925
2926#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002927 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002928#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002929 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002930#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002931 ret = PyUnicode_FromFormatV(format, vargs);
2932 va_end(vargs);
2933 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002934}
2935
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002936#ifdef HAVE_WCHAR_H
2937
Victor Stinner5593d8a2010-10-02 11:11:27 +00002938/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2939 convert a Unicode object to a wide character string.
2940
Victor Stinnerd88d9832011-09-06 02:00:05 +02002941 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002942 character) required to convert the unicode object. Ignore size argument.
2943
Victor Stinnerd88d9832011-09-06 02:00:05 +02002944 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002945 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002946 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002947static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002948unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002949 wchar_t *w,
2950 Py_ssize_t size)
2951{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002952 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002953 const wchar_t *wstr;
2954
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002955 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002956 if (wstr == NULL)
2957 return -1;
2958
Victor Stinner5593d8a2010-10-02 11:11:27 +00002959 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002960 if (size > res)
2961 size = res + 1;
2962 else
2963 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002964 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002965 return res;
2966 }
2967 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002968 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002969}
2970
2971Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002972PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002973 wchar_t *w,
2974 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002975{
2976 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002977 PyErr_BadInternalCall();
2978 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002979 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002980 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002981}
2982
Victor Stinner137c34c2010-09-29 10:25:54 +00002983wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002984PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002985 Py_ssize_t *size)
2986{
2987 wchar_t* buffer;
2988 Py_ssize_t buflen;
2989
2990 if (unicode == NULL) {
2991 PyErr_BadInternalCall();
2992 return NULL;
2993 }
2994
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002995 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002996 if (buflen == -1)
2997 return NULL;
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002998 buffer = PyMem_NEW(wchar_t, buflen);
Victor Stinner137c34c2010-09-29 10:25:54 +00002999 if (buffer == NULL) {
3000 PyErr_NoMemory();
3001 return NULL;
3002 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003003 buflen = unicode_aswidechar(unicode, buffer, buflen);
Stefan Krah8528c312012-08-19 21:52:43 +02003004 if (buflen == -1) {
3005 PyMem_FREE(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003006 return NULL;
Stefan Krah8528c312012-08-19 21:52:43 +02003007 }
Victor Stinner5593d8a2010-10-02 11:11:27 +00003008 if (size != NULL)
3009 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00003010 return buffer;
3011}
3012
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003013#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003014
Alexander Belopolsky40018472011-02-26 01:02:56 +00003015PyObject *
3016PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003017{
Victor Stinner8faf8212011-12-08 22:14:11 +01003018 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003019 PyErr_SetString(PyExc_ValueError,
3020 "chr() arg not in range(0x110000)");
3021 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003022 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00003023
Victor Stinner985a82a2014-01-03 12:53:47 +01003024 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003025}
3026
Alexander Belopolsky40018472011-02-26 01:02:56 +00003027PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003028PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003029{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003030 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00003031 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003032 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003033 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02003034 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00003035 Py_INCREF(obj);
3036 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003037 }
3038 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003039 /* For a Unicode subtype that's not a Unicode object,
3040 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003041 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003042 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003043 PyErr_Format(PyExc_TypeError,
3044 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00003045 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003046 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003047}
3048
Alexander Belopolsky40018472011-02-26 01:02:56 +00003049PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003050PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003051 const char *encoding,
3052 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003053{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003054 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003055 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003056
Guido van Rossumd57fd912000-03-10 22:53:23 +00003057 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003058 PyErr_BadInternalCall();
3059 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003060 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003061
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003062 /* Decoding bytes objects is the most common case and should be fast */
3063 if (PyBytes_Check(obj)) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003064 if (PyBytes_GET_SIZE(obj) == 0)
3065 _Py_RETURN_UNICODE_EMPTY();
3066 v = PyUnicode_Decode(
3067 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3068 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003069 return v;
3070 }
3071
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003072 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003073 PyErr_SetString(PyExc_TypeError,
3074 "decoding str is not supported");
3075 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003076 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003077
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003078 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3079 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3080 PyErr_Format(PyExc_TypeError,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003081 "decoding to str: need a bytes-like object, %.80s found",
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003082 Py_TYPE(obj)->tp_name);
3083 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003084 }
Tim Petersced69f82003-09-16 20:30:58 +00003085
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003086 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003087 PyBuffer_Release(&buffer);
3088 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003089 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003090
Serhiy Storchaka05997252013-01-26 12:14:02 +02003091 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003092 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003093 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003094}
3095
/* Convert encoding to lower case and replace '_' with '-' in order to
   catch e.g. UTF_8.  The normalized, null-terminated name is written to
   `lower`, a buffer of `lower_len` bytes.  A NULL encoding is normalized
   to "utf-8".

   Return 0 on error (lower_len is 0, or the encoding is longer than
   lower_len-1), 1 on success. */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;

    /* Without room for even the terminator nothing can be stored; bail out
       before `lower_len - 1` below wraps around. */
    if (lower_len == 0)
        return 0;

    if (encoding == NULL) {
        /* 6 == strlen("utf-8") + 1 */
        if (lower_len < 6)
            return 0;
        strcpy(lower, "utf-8");
        return 1;
    }
    e = encoding;
    l = lower;
    /* Last usable slot: reserved for the terminating null byte. */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
3135
Alexander Belopolsky40018472011-02-26 01:02:56 +00003136PyObject *
3137PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003138 Py_ssize_t size,
3139 const char *encoding,
3140 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003141{
3142 PyObject *buffer = NULL, *unicode;
3143 Py_buffer info;
3144 char lower[11]; /* Enough for any encoding shortcut */
3145
Fred Drakee4315f52000-05-09 19:53:39 +00003146 /* Shortcuts for common default encodings */
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003147 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003148 if ((strcmp(lower, "utf-8") == 0) ||
3149 (strcmp(lower, "utf8") == 0))
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003150 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
Victor Stinner37296e82010-06-10 13:36:23 +00003151 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003152 (strcmp(lower, "latin1") == 0) ||
Victor Stinnerfa3ba4c2013-10-29 11:34:05 +01003153 (strcmp(lower, "iso-8859-1") == 0) ||
3154 (strcmp(lower, "iso8859-1") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00003155 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003156#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00003157 else if (strcmp(lower, "mbcs") == 0)
3158 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003159#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003160 else if (strcmp(lower, "ascii") == 0)
3161 return PyUnicode_DecodeASCII(s, size, errors);
3162 else if (strcmp(lower, "utf-16") == 0)
3163 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3164 else if (strcmp(lower, "utf-32") == 0)
3165 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3166 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003167
3168 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003169 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003170 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003171 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003172 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003173 if (buffer == NULL)
3174 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003175 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003176 if (unicode == NULL)
3177 goto onError;
3178 if (!PyUnicode_Check(unicode)) {
3179 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003180 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3181 "use codecs.decode() to decode to arbitrary types",
3182 encoding,
3183 Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003184 Py_DECREF(unicode);
3185 goto onError;
3186 }
3187 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003188 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003189
Benjamin Peterson29060642009-01-31 22:14:21 +00003190 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003191 Py_XDECREF(buffer);
3192 return NULL;
3193}
3194
Alexander Belopolsky40018472011-02-26 01:02:56 +00003195PyObject *
3196PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003197 const char *encoding,
3198 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003199{
3200 PyObject *v;
3201
3202 if (!PyUnicode_Check(unicode)) {
3203 PyErr_BadArgument();
3204 goto onError;
3205 }
3206
3207 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003208 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003209
3210 /* Decode via the codec registry */
3211 v = PyCodec_Decode(unicode, encoding, errors);
3212 if (v == NULL)
3213 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003214 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003215
Benjamin Peterson29060642009-01-31 22:14:21 +00003216 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003217 return NULL;
3218}
3219
Alexander Belopolsky40018472011-02-26 01:02:56 +00003220PyObject *
3221PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003222 const char *encoding,
3223 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003224{
3225 PyObject *v;
3226
3227 if (!PyUnicode_Check(unicode)) {
3228 PyErr_BadArgument();
3229 goto onError;
3230 }
3231
3232 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003233 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003234
3235 /* Decode via the codec registry */
3236 v = PyCodec_Decode(unicode, encoding, errors);
3237 if (v == NULL)
3238 goto onError;
3239 if (!PyUnicode_Check(v)) {
3240 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003241 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3242 "use codecs.decode() to decode to arbitrary types",
3243 encoding,
3244 Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003245 Py_DECREF(v);
3246 goto onError;
3247 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003248 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003249
Benjamin Peterson29060642009-01-31 22:14:21 +00003250 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003251 return NULL;
3252}
3253
Alexander Belopolsky40018472011-02-26 01:02:56 +00003254PyObject *
3255PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003256 Py_ssize_t size,
3257 const char *encoding,
3258 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003259{
3260 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003261
Guido van Rossumd57fd912000-03-10 22:53:23 +00003262 unicode = PyUnicode_FromUnicode(s, size);
3263 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003264 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003265 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3266 Py_DECREF(unicode);
3267 return v;
3268}
3269
Alexander Belopolsky40018472011-02-26 01:02:56 +00003270PyObject *
3271PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003272 const char *encoding,
3273 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003274{
3275 PyObject *v;
3276
3277 if (!PyUnicode_Check(unicode)) {
3278 PyErr_BadArgument();
3279 goto onError;
3280 }
3281
3282 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003283 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003284
3285 /* Encode via the codec registry */
3286 v = PyCodec_Encode(unicode, encoding, errors);
3287 if (v == NULL)
3288 goto onError;
3289 return v;
3290
Benjamin Peterson29060642009-01-31 22:14:21 +00003291 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003292 return NULL;
3293}
3294
/* Locate the first wide character of `wstr` that wcstombs() rejects.

   The string is converted one character at a time (one surrogate pair at a
   time when wchar_t is 16 bits wide).  Returns the index of the first unit
   for which wcstombs() reports an error, or 0 when every unit converts. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
    const wchar_t *cursor;
    char scratch[MB_LEN_MAX];
    /* One character (or one surrogate pair) plus the null terminator. */
#if SIZEOF_WCHAR_T == 2
    wchar_t unit[3] = {0, 0, 0};
#else
    wchar_t unit[2] = {0, 0};
#endif

    for (cursor = wstr; *cursor != L'\0'; ) {
        const wchar_t *unit_start = cursor;

#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cursor[1]))
        {
            unit[0] = cursor[0];
            unit[1] = cursor[1];
            cursor += 2;
        }
        else {
            unit[0] = *cursor;
            unit[1] = 0;
            cursor++;
        }
#else
        unit[0] = *cursor;
        cursor++;
#endif
        if (wcstombs(scratch, unit, sizeof(scratch)) == (size_t)-1)
            return unit_start - wstr;
    }

    /* failed to find the unencodable character */
    return 0;
}
3341
Victor Stinner1b579672011-12-17 05:47:23 +01003342static int
3343locale_error_handler(const char *errors, int *surrogateescape)
3344{
Victor Stinner50149202015-09-22 00:26:54 +02003345 _Py_error_handler error_handler = get_error_handler(errors);
3346 switch (error_handler)
3347 {
3348 case _Py_ERROR_STRICT:
Victor Stinner1b579672011-12-17 05:47:23 +01003349 *surrogateescape = 0;
3350 return 0;
Victor Stinner50149202015-09-22 00:26:54 +02003351 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner1b579672011-12-17 05:47:23 +01003352 *surrogateescape = 1;
3353 return 0;
Victor Stinner50149202015-09-22 00:26:54 +02003354 default:
3355 PyErr_Format(PyExc_ValueError,
3356 "only 'strict' and 'surrogateescape' error handlers "
3357 "are supported, not '%s'",
3358 errors);
3359 return -1;
Victor Stinner1b579672011-12-17 05:47:23 +01003360 }
Victor Stinner1b579672011-12-17 05:47:23 +01003361}
3362
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003363PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003364PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003365{
3366 Py_ssize_t wlen, wlen2;
3367 wchar_t *wstr;
3368 PyObject *bytes = NULL;
3369 char *errmsg;
Raymond Hettingere56666d2013-08-04 11:51:03 -07003370 PyObject *reason = NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003371 PyObject *exc;
3372 size_t error_pos;
Victor Stinner1b579672011-12-17 05:47:23 +01003373 int surrogateescape;
3374
3375 if (locale_error_handler(errors, &surrogateescape) < 0)
3376 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003377
3378 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3379 if (wstr == NULL)
3380 return NULL;
3381
3382 wlen2 = wcslen(wstr);
3383 if (wlen2 != wlen) {
3384 PyMem_Free(wstr);
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003385 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003386 return NULL;
3387 }
3388
3389 if (surrogateescape) {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003390 /* "surrogateescape" error handler */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003391 char *str;
3392
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003393 str = Py_EncodeLocale(wstr, &error_pos);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003394 if (str == NULL) {
3395 if (error_pos == (size_t)-1) {
3396 PyErr_NoMemory();
3397 PyMem_Free(wstr);
3398 return NULL;
3399 }
3400 else {
3401 goto encode_error;
3402 }
3403 }
3404 PyMem_Free(wstr);
3405
3406 bytes = PyBytes_FromString(str);
3407 PyMem_Free(str);
3408 }
3409 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003410 /* strict mode */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003411 size_t len, len2;
3412
3413 len = wcstombs(NULL, wstr, 0);
3414 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003415 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003416 goto encode_error;
3417 }
3418
3419 bytes = PyBytes_FromStringAndSize(NULL, len);
3420 if (bytes == NULL) {
3421 PyMem_Free(wstr);
3422 return NULL;
3423 }
3424
3425 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3426 if (len2 == (size_t)-1 || len2 > len) {
Victor Stinner2f197072011-12-17 07:08:30 +01003427 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003428 goto encode_error;
3429 }
3430 PyMem_Free(wstr);
3431 }
3432 return bytes;
3433
3434encode_error:
3435 errmsg = strerror(errno);
3436 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003437
3438 if (error_pos == (size_t)-1)
3439 error_pos = wcstombs_errorpos(wstr);
3440
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003441 PyMem_Free(wstr);
3442 Py_XDECREF(bytes);
3443
Victor Stinner2f197072011-12-17 07:08:30 +01003444 if (errmsg != NULL) {
3445 size_t errlen;
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003446 wstr = Py_DecodeLocale(errmsg, &errlen);
Victor Stinner2f197072011-12-17 07:08:30 +01003447 if (wstr != NULL) {
3448 reason = PyUnicode_FromWideChar(wstr, errlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003449 PyMem_RawFree(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003450 } else
3451 errmsg = NULL;
3452 }
3453 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003454 reason = PyUnicode_FromString(
3455 "wcstombs() encountered an unencodable "
3456 "wide character");
3457 if (reason == NULL)
3458 return NULL;
3459
3460 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3461 "locale", unicode,
3462 (Py_ssize_t)error_pos,
3463 (Py_ssize_t)(error_pos+1),
3464 reason);
3465 Py_DECREF(reason);
3466 if (exc != NULL) {
3467 PyCodec_StrictErrors(exc);
3468 Py_XDECREF(exc);
3469 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003470 return NULL;
3471}
3472
Victor Stinnerad158722010-10-27 00:25:46 +00003473PyObject *
3474PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003475{
Victor Stinner99b95382011-07-04 14:23:54 +02003476#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003477 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003478#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003479 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003480#else
Victor Stinner793b5312011-04-27 00:24:21 +02003481 PyInterpreterState *interp = PyThreadState_GET()->interp;
3482 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3483 cannot use it to encode and decode filenames before it is loaded. Load
3484 the Python codec requires to encode at least its own filename. Use the C
3485 version of the locale codec until the codec registry is initialized and
3486 the Python codec is loaded.
3487
3488 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3489 cannot only rely on it: check also interp->fscodec_initialized for
3490 subinterpreters. */
3491 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003492 return PyUnicode_AsEncodedString(unicode,
3493 Py_FileSystemDefaultEncoding,
3494 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003495 }
3496 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003497 return PyUnicode_EncodeLocale(unicode, "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003498 }
Victor Stinnerad158722010-10-27 00:25:46 +00003499#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003500}
3501
Alexander Belopolsky40018472011-02-26 01:02:56 +00003502PyObject *
3503PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003504 const char *encoding,
3505 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003506{
3507 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003508 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003509
Guido van Rossumd57fd912000-03-10 22:53:23 +00003510 if (!PyUnicode_Check(unicode)) {
3511 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003512 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003513 }
Fred Drakee4315f52000-05-09 19:53:39 +00003514
Fred Drakee4315f52000-05-09 19:53:39 +00003515 /* Shortcuts for common default encodings */
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003516 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003517 if ((strcmp(lower, "utf-8") == 0) ||
3518 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003519 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003520 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003521 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003522 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003523 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003524 }
Victor Stinner37296e82010-06-10 13:36:23 +00003525 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003526 (strcmp(lower, "latin1") == 0) ||
Victor Stinnerfa3ba4c2013-10-29 11:34:05 +01003527 (strcmp(lower, "iso-8859-1") == 0) ||
3528 (strcmp(lower, "iso8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003529 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003530#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003531 else if (strcmp(lower, "mbcs") == 0)
3532 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003533#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003534 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003535 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003536 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003537
3538 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003539 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003540 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003541 return NULL;
3542
3543 /* The normal path */
3544 if (PyBytes_Check(v))
3545 return v;
3546
3547 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003548 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003549 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003550 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003551
3552 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003553 "encoder %s returned bytearray instead of bytes; "
3554 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003555 encoding);
3556 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003557 Py_DECREF(v);
3558 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003559 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003560
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003561 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3562 Py_DECREF(v);
3563 return b;
3564 }
3565
3566 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003567 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
3568 "use codecs.encode() to encode to arbitrary types",
3569 encoding,
3570 Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003571 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003572 return NULL;
3573}
3574
Alexander Belopolsky40018472011-02-26 01:02:56 +00003575PyObject *
3576PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003577 const char *encoding,
3578 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003579{
3580 PyObject *v;
3581
3582 if (!PyUnicode_Check(unicode)) {
3583 PyErr_BadArgument();
3584 goto onError;
3585 }
3586
3587 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003588 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003589
3590 /* Encode via the codec registry */
3591 v = PyCodec_Encode(unicode, encoding, errors);
3592 if (v == NULL)
3593 goto onError;
3594 if (!PyUnicode_Check(v)) {
3595 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003596 "'%.400s' encoder returned '%.400s' instead of 'str'; "
3597 "use codecs.encode() to encode to arbitrary types",
3598 encoding,
3599 Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003600 Py_DECREF(v);
3601 goto onError;
3602 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003603 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003604
Benjamin Peterson29060642009-01-31 22:14:21 +00003605 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003606 return NULL;
3607}
3608
/* Locate the first byte sequence in str[0..len) that mbrtowc() cannot
   convert in the current locale.

   Returns the byte offset of that sequence.  Returns 0 when no such
   sequence is found, when a null character is converted first, or when
   mbrtowc() is not available (HAVE_MBRTOWC undefined). */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    mbstate_t state;
    const char *cursor = str;
    size_t remaining = len;

    memset(&state, 0, sizeof state);
    while (remaining != 0) {
        wchar_t wc;
        size_t consumed = mbrtowc(&wc, cursor, remaining, &state);

        if (consumed == 0) {
            /* Reached end of string */
            return 0;
        }
        if (consumed == (size_t)-1 || consumed == (size_t)-2) {
            /* Conversion error or incomplete character */
            return cursor - str;
        }
        cursor += consumed;
        remaining -= consumed;
    }
    /* failed to find the undecodable byte sequence */
    return 0;
#else
    return 0;
#endif
}
3639
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003640PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003641PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003642 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003643{
3644 wchar_t smallbuf[256];
3645 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3646 wchar_t *wstr;
3647 size_t wlen, wlen2;
3648 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003649 int surrogateescape;
Victor Stinner2f197072011-12-17 07:08:30 +01003650 size_t error_pos;
3651 char *errmsg;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01003652 PyObject *reason = NULL; /* initialize to prevent gcc warning */
3653 PyObject *exc;
Victor Stinner1b579672011-12-17 05:47:23 +01003654
3655 if (locale_error_handler(errors, &surrogateescape) < 0)
3656 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003657
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003658 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3659 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003660 return NULL;
3661 }
3662
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003663 if (surrogateescape) {
3664 /* "surrogateescape" error handler */
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003665 wstr = Py_DecodeLocale(str, &wlen);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003666 if (wstr == NULL) {
3667 if (wlen == (size_t)-1)
3668 PyErr_NoMemory();
3669 else
3670 PyErr_SetFromErrno(PyExc_OSError);
3671 return NULL;
3672 }
3673
3674 unicode = PyUnicode_FromWideChar(wstr, wlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003675 PyMem_RawFree(wstr);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003676 }
3677 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003678 /* strict mode */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003679#ifndef HAVE_BROKEN_MBSTOWCS
3680 wlen = mbstowcs(NULL, str, 0);
3681#else
3682 wlen = len;
3683#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003684 if (wlen == (size_t)-1)
3685 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003686 if (wlen+1 <= smallbuf_len) {
3687 wstr = smallbuf;
3688 }
3689 else {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02003690 wstr = PyMem_New(wchar_t, wlen+1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003691 if (!wstr)
3692 return PyErr_NoMemory();
3693 }
3694
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003695 wlen2 = mbstowcs(wstr, str, wlen+1);
3696 if (wlen2 == (size_t)-1) {
3697 if (wstr != smallbuf)
3698 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003699 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003700 }
3701#ifdef HAVE_BROKEN_MBSTOWCS
3702 assert(wlen2 == wlen);
3703#endif
3704 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3705 if (wstr != smallbuf)
3706 PyMem_Free(wstr);
3707 }
3708 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003709
3710decode_error:
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +02003711 reason = NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01003712 errmsg = strerror(errno);
3713 assert(errmsg != NULL);
3714
3715 error_pos = mbstowcs_errorpos(str, len);
3716 if (errmsg != NULL) {
3717 size_t errlen;
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003718 wstr = Py_DecodeLocale(errmsg, &errlen);
Victor Stinner2f197072011-12-17 07:08:30 +01003719 if (wstr != NULL) {
3720 reason = PyUnicode_FromWideChar(wstr, errlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003721 PyMem_RawFree(wstr);
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +02003722 }
Victor Stinner2f197072011-12-17 07:08:30 +01003723 }
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +02003724 if (reason == NULL)
Victor Stinner2f197072011-12-17 07:08:30 +01003725 reason = PyUnicode_FromString(
3726 "mbstowcs() encountered an invalid multibyte sequence");
3727 if (reason == NULL)
3728 return NULL;
3729
3730 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3731 "locale", str, len,
3732 (Py_ssize_t)error_pos,
3733 (Py_ssize_t)(error_pos+1),
3734 reason);
3735 Py_DECREF(reason);
3736 if (exc != NULL) {
3737 PyCodec_StrictErrors(exc);
3738 Py_XDECREF(exc);
3739 }
3740 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003741}
3742
3743PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003744PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003745{
3746 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003747 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003748}
3749
3750
3751PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003752PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003753 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003754 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3755}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003756
Christian Heimes5894ba72007-11-04 11:43:14 +00003757PyObject*
3758PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3759{
Victor Stinner99b95382011-07-04 14:23:54 +02003760#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003761 return PyUnicode_DecodeMBCS(s, size, NULL);
3762#elif defined(__APPLE__)
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003763 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003764#else
Victor Stinner793b5312011-04-27 00:24:21 +02003765 PyInterpreterState *interp = PyThreadState_GET()->interp;
3766 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3767 cannot use it to encode and decode filenames before it is loaded. Load
3768 the Python codec requires to encode at least its own filename. Use the C
3769 version of the locale codec until the codec registry is initialized and
3770 the Python codec is loaded.
3771
3772 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3773 cannot only rely on it: check also interp->fscodec_initialized for
3774 subinterpreters. */
3775 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003776 return PyUnicode_Decode(s, size,
3777 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003778 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003779 }
3780 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003781 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003782 }
Victor Stinnerad158722010-10-27 00:25:46 +00003783#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003784}
3785
Martin v. Löwis011e8422009-05-05 04:43:17 +00003786
3787int
3788PyUnicode_FSConverter(PyObject* arg, void* addr)
3789{
3790 PyObject *output = NULL;
3791 Py_ssize_t size;
3792 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003793 if (arg == NULL) {
3794 Py_DECREF(*(PyObject**)addr);
Benjamin Petersona4d33b32015-11-15 21:57:39 -08003795 *(PyObject**)addr = NULL;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003796 return 1;
3797 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003798 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003799 output = arg;
3800 Py_INCREF(output);
3801 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003802 else if (PyUnicode_Check(arg)) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003803 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003804 if (!output)
3805 return 0;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003806 assert(PyBytes_Check(output));
3807 }
3808 else {
3809 PyErr_Format(PyExc_TypeError,
3810 "must be str or bytes, not %.100s",
3811 Py_TYPE(arg)->tp_name);
3812 return 0;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003813 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003814 size = PyBytes_GET_SIZE(output);
3815 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02003816 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003817 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003818 Py_DECREF(output);
3819 return 0;
3820 }
3821 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003822 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003823}
3824
3825
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003826int
3827PyUnicode_FSDecoder(PyObject* arg, void* addr)
3828{
3829 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003830 if (arg == NULL) {
3831 Py_DECREF(*(PyObject**)addr);
3832 return 1;
3833 }
3834 if (PyUnicode_Check(arg)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003835 if (PyUnicode_READY(arg) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003836 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003837 output = arg;
3838 Py_INCREF(output);
3839 }
3840 else {
3841 arg = PyBytes_FromObject(arg);
3842 if (!arg)
3843 return 0;
3844 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3845 PyBytes_GET_SIZE(arg));
3846 Py_DECREF(arg);
3847 if (!output)
3848 return 0;
3849 if (!PyUnicode_Check(output)) {
3850 Py_DECREF(output);
3851 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3852 return 0;
3853 }
3854 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003855 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003856 Py_DECREF(output);
3857 return 0;
3858 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003859 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003860 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003861 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003862 Py_DECREF(output);
3863 return 0;
3864 }
3865 *(PyObject**)addr = output;
3866 return Py_CLEANUP_SUPPORTED;
3867}
3868
3869
Martin v. Löwis5b222132007-06-10 09:51:05 +00003870char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003871PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003872{
Christian Heimesf3863112007-11-22 07:46:41 +00003873 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003874
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003875 if (!PyUnicode_Check(unicode)) {
3876 PyErr_BadArgument();
3877 return NULL;
3878 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003879 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003880 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003881
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003882 if (PyUnicode_UTF8(unicode) == NULL) {
3883 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003884 bytes = _PyUnicode_AsUTF8String(unicode, NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003885 if (bytes == NULL)
3886 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003887 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3888 if (_PyUnicode_UTF8(unicode) == NULL) {
Victor Stinnera5afb582013-10-29 01:28:23 +01003889 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003890 Py_DECREF(bytes);
3891 return NULL;
3892 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003893 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3894 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3895 PyBytes_AS_STRING(bytes),
3896 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003897 Py_DECREF(bytes);
3898 }
3899
3900 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003901 *psize = PyUnicode_UTF8_LENGTH(unicode);
3902 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003903}
3904
3905char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003906PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003907{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003908 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3909}
3910
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003911Py_UNICODE *
3912PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3913{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003914 const unsigned char *one_byte;
3915#if SIZEOF_WCHAR_T == 4
3916 const Py_UCS2 *two_bytes;
3917#else
3918 const Py_UCS4 *four_bytes;
3919 const Py_UCS4 *ucs4_end;
3920 Py_ssize_t num_surrogates;
3921#endif
3922 wchar_t *w;
3923 wchar_t *wchar_end;
3924
3925 if (!PyUnicode_Check(unicode)) {
3926 PyErr_BadArgument();
3927 return NULL;
3928 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003929 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003930 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003931 assert(_PyUnicode_KIND(unicode) != 0);
3932 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003933
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003934 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003935#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003936 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3937 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003938 num_surrogates = 0;
3939
3940 for (; four_bytes < ucs4_end; ++four_bytes) {
3941 if (*four_bytes > 0xFFFF)
3942 ++num_surrogates;
3943 }
3944
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003945 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3946 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3947 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003948 PyErr_NoMemory();
3949 return NULL;
3950 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003951 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003952
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003953 w = _PyUnicode_WSTR(unicode);
3954 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3955 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003956 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3957 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01003958 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003959 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01003960 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3961 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003962 }
3963 else
3964 *w = *four_bytes;
3965
3966 if (w > wchar_end) {
3967 assert(0 && "Miscalculated string end");
3968 }
3969 }
3970 *w = 0;
3971#else
3972 /* sizeof(wchar_t) == 4 */
3973 Py_FatalError("Impossible unicode object state, wstr and str "
3974 "should share memory already.");
3975 return NULL;
3976#endif
3977 }
3978 else {
Serhiy Storchakae55181f2015-02-20 21:34:06 +02003979 if ((size_t)_PyUnicode_LENGTH(unicode) >
3980 PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
3981 PyErr_NoMemory();
3982 return NULL;
3983 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003984 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3985 (_PyUnicode_LENGTH(unicode) + 1));
3986 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003987 PyErr_NoMemory();
3988 return NULL;
3989 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003990 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3991 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3992 w = _PyUnicode_WSTR(unicode);
3993 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003994
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003995 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3996 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003997 for (; w < wchar_end; ++one_byte, ++w)
3998 *w = *one_byte;
3999 /* null-terminate the wstr */
4000 *w = 0;
4001 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004002 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004003#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004004 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004005 for (; w < wchar_end; ++two_bytes, ++w)
4006 *w = *two_bytes;
4007 /* null-terminate the wstr */
4008 *w = 0;
4009#else
4010 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004011 PyObject_FREE(_PyUnicode_WSTR(unicode));
4012 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004013 Py_FatalError("Impossible unicode object state, wstr "
4014 "and str should share memory already.");
4015 return NULL;
4016#endif
4017 }
4018 else {
4019 assert(0 && "This should never happen.");
4020 }
4021 }
4022 }
4023 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004024 *size = PyUnicode_WSTR_LENGTH(unicode);
4025 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00004026}
4027
Alexander Belopolsky40018472011-02-26 01:02:56 +00004028Py_UNICODE *
4029PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004030{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004031 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004032}
4033
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004034
Alexander Belopolsky40018472011-02-26 01:02:56 +00004035Py_ssize_t
4036PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004037{
4038 if (!PyUnicode_Check(unicode)) {
4039 PyErr_BadArgument();
4040 goto onError;
4041 }
4042 return PyUnicode_GET_SIZE(unicode);
4043
Benjamin Peterson29060642009-01-31 22:14:21 +00004044 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004045 return -1;
4046}
4047
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004048Py_ssize_t
4049PyUnicode_GetLength(PyObject *unicode)
4050{
Victor Stinner07621332012-06-16 04:53:46 +02004051 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004052 PyErr_BadArgument();
4053 return -1;
4054 }
Victor Stinner07621332012-06-16 04:53:46 +02004055 if (PyUnicode_READY(unicode) == -1)
4056 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004057 return PyUnicode_GET_LENGTH(unicode);
4058}
4059
4060Py_UCS4
4061PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4062{
Victor Stinner69ed0f42013-04-09 21:48:24 +02004063 void *data;
4064 int kind;
4065
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004066 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
4067 PyErr_BadArgument();
4068 return (Py_UCS4)-1;
4069 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004070 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004071 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004072 return (Py_UCS4)-1;
4073 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02004074 data = PyUnicode_DATA(unicode);
4075 kind = PyUnicode_KIND(unicode);
4076 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004077}
4078
4079int
4080PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4081{
4082 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004083 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004084 return -1;
4085 }
Victor Stinner488fa492011-12-12 00:01:39 +01004086 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004087 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004088 PyErr_SetString(PyExc_IndexError, "string index out of range");
4089 return -1;
4090 }
Victor Stinner488fa492011-12-12 00:01:39 +01004091 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004092 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004093 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4094 PyErr_SetString(PyExc_ValueError, "character out of range");
4095 return -1;
4096 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004097 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4098 index, ch);
4099 return 0;
4100}
4101
/* Return the name of the default encoding, which is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4107
Victor Stinner554f3f02010-06-16 23:33:54 +00004108/* create or adjust a UnicodeDecodeError */
4109static void
4110make_decode_exception(PyObject **exceptionObject,
4111 const char *encoding,
4112 const char *input, Py_ssize_t length,
4113 Py_ssize_t startpos, Py_ssize_t endpos,
4114 const char *reason)
4115{
4116 if (*exceptionObject == NULL) {
4117 *exceptionObject = PyUnicodeDecodeError_Create(
4118 encoding, input, length, startpos, endpos, reason);
4119 }
4120 else {
4121 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4122 goto onError;
4123 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4124 goto onError;
4125 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4126 goto onError;
4127 }
4128 return;
4129
4130onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004131 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004132}
4133
#ifdef HAVE_MBCS
/* error handling callback helper:
   build arguments, call the callback and check the arguments,
   if no exception occurred, copy the replacement to the output
   and adjust various state variables.
   return 0 on success, -1 on error

   errors/errorHandler: handler name and cached handler object; the handler
       is looked up with PyCodec_LookupError() on first use.
   encoding/reason: stored in the UnicodeDecodeError passed to the handler.
   input/inend/startinpos/endinpos/inptr: decoder input state; input, inend,
       endinpos and inptr are updated from the exception object and from the
       position returned by the handler.
   exceptionObject: cached exception, created or updated for each call.
   output/outpos: wchar_t-kind output string and the write position in it;
       the string is resized when the replacement does not fit.
*/

static int
unicode_decode_call_errorhandler_wchar(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    PyObject **output, Py_ssize_t *outpos)
{
    static const char *argparse = "O!n;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t outsize;
    Py_ssize_t insize;
    Py_ssize_t requiredsize;
    Py_ssize_t newpos;
    PyObject *inputobj = NULL;
    wchar_t *repwstr;
    Py_ssize_t repwlen;

    assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
    outsize = _PyUnicode_WSTR_LENGTH(*output);

    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        /* &argparse[4] skips the "O!n;" prefix, leaving only the message */
        PyErr_SetString(PyExc_TypeError, &argparse[4]);
        goto onError;
    }
    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    if (!PyBytes_Check(inputobj)) {
        PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
        /* Stop here: the PyBytes_* macros below must never be applied to a
           non-bytes object.  Release the reference obtained above first. */
        Py_DECREF(inputobj);
        goto onError;
    }
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    /* A negative position counts from the end of the input */
    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
    if (repwstr == NULL)
        goto onError;
    /* need more space? (at least enough for what we
       have+the replacement+the rest of the string (starting
       at the new input position), so we won't have to check space
       when there are no errors in the rest of the string) */
    requiredsize = *outpos;
    if (requiredsize > PY_SSIZE_T_MAX - repwlen)
        goto overflow;
    requiredsize += repwlen;
    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
        goto overflow;
    requiredsize += insize - newpos;
    if (requiredsize > outsize) {
        /* Grow to at least twice the current size when that cannot overflow */
        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
            requiredsize = 2*outsize;
        if (unicode_resize(output, requiredsize) < 0)
            goto onError;
    }
    wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
    *outpos += repwlen;
    *endinpos = newpos;
    *inptr = *input + newpos;

    /* we made it! */
    Py_XDECREF(restuple);
    return 0;

  overflow:
    PyErr_SetString(PyExc_OverflowError,
                    "decoded result is too long for a Python string");

  onError:
    Py_XDECREF(restuple);
    return -1;
}
#endif   /* HAVE_MBCS */
4249
4250static int
4251unicode_decode_call_errorhandler_writer(
4252 const char *errors, PyObject **errorHandler,
4253 const char *encoding, const char *reason,
4254 const char **input, const char **inend, Py_ssize_t *startinpos,
4255 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4256 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4257{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02004258 static const char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004259
4260 PyObject *restuple = NULL;
4261 PyObject *repunicode = NULL;
4262 Py_ssize_t insize;
4263 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004264 Py_ssize_t replen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004265 PyObject *inputobj = NULL;
4266
4267 if (*errorHandler == NULL) {
4268 *errorHandler = PyCodec_LookupError(errors);
4269 if (*errorHandler == NULL)
4270 goto onError;
4271 }
4272
4273 make_decode_exception(exceptionObject,
4274 encoding,
4275 *input, *inend - *input,
4276 *startinpos, *endinpos,
4277 reason);
4278 if (*exceptionObject == NULL)
4279 goto onError;
4280
4281 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4282 if (restuple == NULL)
4283 goto onError;
4284 if (!PyTuple_Check(restuple)) {
4285 PyErr_SetString(PyExc_TypeError, &argparse[4]);
4286 goto onError;
4287 }
4288 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004289 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004290
4291 /* Copy back the bytes variables, which might have been modified by the
4292 callback */
4293 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4294 if (!inputobj)
4295 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00004296 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004297 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00004298 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004299 *input = PyBytes_AS_STRING(inputobj);
4300 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004301 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004302 /* we can DECREF safely, as the exception has another reference,
4303 so the object won't go away. */
4304 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004305
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004306 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004307 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004308 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004309 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004310 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004311 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004312
Victor Stinner8f674cc2013-04-17 23:02:17 +02004313 if (PyUnicode_READY(repunicode) < 0)
4314 goto onError;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004315 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004316 if (replen > 1) {
4317 writer->min_length += replen - 1;
Victor Stinner8f674cc2013-04-17 23:02:17 +02004318 writer->overallocate = 1;
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004319 if (_PyUnicodeWriter_Prepare(writer, writer->min_length,
4320 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4321 goto onError;
4322 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004323 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004324 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004325
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004326 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004327 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004328
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004329 /* we made it! */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004330 Py_XDECREF(restuple);
4331 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004332
Benjamin Peterson29060642009-01-31 22:14:21 +00004333 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004334 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004335 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004336}
4337
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004338/* --- UTF-7 Codec -------------------------------------------------------- */
4339
Antoine Pitrou244651a2009-05-04 18:56:13 +00004340/* See RFC2152 for details. We encode conservatively and decode liberally. */
4341
4342/* Three simple macros defining base-64. */
4343
/* Non-zero when c belongs to the base-64 alphabet
   (letters, digits, '+' and '/'). */

#define IS_BASE64(c) \
    ((c) == '+' || (c) == '/' ||        \
     ((c) >= '0' && (c) <= '9') ||      \
     ((c) >= 'a' && (c) <= 'z') ||      \
     ((c) >= 'A' && (c) <= 'Z'))
4351
/* Map a base-64 character c to its 6-bit value.  The caller must have
   checked c with IS_BASE64(); anything unrecognised yields 63. */

#define FROM_BASE64(c) \
    (((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :           \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     (c) == '+' ? 62 : 63)
4359
/* Base-64 character that encodes the low 6 bits of n. */

#define TO_BASE64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"       \
     "abcdefghijklmnopqrstuvwxyz"       \
     "0123456789+/"[(n) & 0x3f])
4364
/* DECODE_DIRECT: true when byte c, met outside a shift sequence in a
 * UTF-7 string, stands for itself.  Decoding is permissive: every
 * ASCII byte qualifies except '+', which opens a base64 section. */

#define DECODE_DIRECT(c) \
    ((c) != '+' && (c) <= 127)
4372
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code (0..127) and is read-only,
 * hence const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
4406
/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c must lie strictly between 0 and 128 before utf7_category is
 * indexed.  directO and directWS are parenthesized in the expansion so
 * that compound argument expressions keep their meaning. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004418
Alexander Belopolsky40018472011-02-26 01:02:56 +00004419PyObject *
4420PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004421 Py_ssize_t size,
4422 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004423{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004424 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4425}
4426
Antoine Pitrou244651a2009-05-04 18:56:13 +00004427/* The decoder. The only state we preserve is our read position,
4428 * i.e. how many characters we have consumed. So if we end in the
4429 * middle of a shift sequence we have to back off the read position
4430 * and the output to the beginning of the sequence, otherwise we lose
4431 * all the shift state (seen bits, number of bits seen, high
4432 * surrogate). */
4433
Alexander Belopolsky40018472011-02-26 01:02:56 +00004434PyObject *
4435PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004436 Py_ssize_t size,
4437 const char *errors,
4438 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004439{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004440 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004441 Py_ssize_t startinpos;
4442 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004443 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004444 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004445 const char *errmsg = "";
4446 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004447 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004448 unsigned int base64bits = 0;
4449 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004450 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004451 PyObject *errorHandler = NULL;
4452 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004453
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004454 if (size == 0) {
4455 if (consumed)
4456 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004457 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004458 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004459
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004460 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004461 _PyUnicodeWriter_Init(&writer);
4462 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004463
4464 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004465 e = s + size;
4466
4467 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004468 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004469 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004470 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004471
Antoine Pitrou244651a2009-05-04 18:56:13 +00004472 if (inShift) { /* in a base-64 section */
4473 if (IS_BASE64(ch)) { /* consume a base-64 character */
4474 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4475 base64bits += 6;
4476 s++;
4477 if (base64bits >= 16) {
4478 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004479 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004480 base64bits -= 16;
4481 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004482 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004483 if (surrogate) {
4484 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004485 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4486 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004487 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004488 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004489 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004490 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004491 }
4492 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004493 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004494 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004495 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004496 }
4497 }
Victor Stinner551ac952011-11-29 22:58:13 +01004498 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004499 /* first surrogate */
4500 surrogate = outCh;
4501 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004502 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004503 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004504 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004505 }
4506 }
4507 }
4508 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004509 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004510 if (base64bits > 0) { /* left-over bits */
4511 if (base64bits >= 6) {
4512 /* We've seen at least one base-64 character */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004513 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004514 errmsg = "partial character in shift sequence";
4515 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004516 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004517 else {
4518 /* Some bits remain; they should be zero */
4519 if (base64buffer != 0) {
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004520 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004521 errmsg = "non-zero padding bits in shift sequence";
4522 goto utf7Error;
4523 }
4524 }
4525 }
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004526 if (surrogate && DECODE_DIRECT(ch)) {
4527 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4528 goto onError;
4529 }
4530 surrogate = 0;
4531 if (ch == '-') {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004532 /* '-' is absorbed; other terminating
4533 characters are preserved */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004534 s++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004535 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004536 }
4537 }
4538 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004539 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004540 s++; /* consume '+' */
4541 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004542 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004543 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004544 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004545 }
4546 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004547 inShift = 1;
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004548 surrogate = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004549 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004550 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004551 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004552 }
4553 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004554 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004555 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004556 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004557 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004558 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004559 else {
4560 startinpos = s-starts;
4561 s++;
4562 errmsg = "unexpected special character";
4563 goto utf7Error;
4564 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004565 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004566utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004567 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004568 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004569 errors, &errorHandler,
4570 "utf7", errmsg,
4571 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004572 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004573 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004574 }
4575
Antoine Pitrou244651a2009-05-04 18:56:13 +00004576 /* end of string */
4577
4578 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4579 /* if we're in an inconsistent state, that's an error */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004580 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004581 if (surrogate ||
4582 (base64bits >= 6) ||
4583 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004584 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004585 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004586 errors, &errorHandler,
4587 "utf7", "unterminated shift sequence",
4588 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004589 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004590 goto onError;
4591 if (s < e)
4592 goto restart;
4593 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004594 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004595
4596 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004597 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004598 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004599 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004600 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004601 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004602 writer.kind, writer.data, shiftOutStart);
4603 Py_XDECREF(errorHandler);
4604 Py_XDECREF(exc);
4605 _PyUnicodeWriter_Dealloc(&writer);
4606 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004607 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004608 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004609 }
4610 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004611 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004612 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004613 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004614
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004615 Py_XDECREF(errorHandler);
4616 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004617 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004618
Benjamin Peterson29060642009-01-31 22:14:21 +00004619 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004620 Py_XDECREF(errorHandler);
4621 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004622 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004623 return NULL;
4624}
4625
4626
Alexander Belopolsky40018472011-02-26 01:02:56 +00004627PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004628_PyUnicode_EncodeUTF7(PyObject *str,
4629 int base64SetO,
4630 int base64WhiteSpace,
4631 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004632{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004633 int kind;
4634 void *data;
4635 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004636 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004637 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004638 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004639 unsigned int base64bits = 0;
4640 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004641 char * out;
4642 char * start;
4643
Benjamin Petersonbac79492012-01-14 13:34:47 -05004644 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004645 return NULL;
4646 kind = PyUnicode_KIND(str);
4647 data = PyUnicode_DATA(str);
4648 len = PyUnicode_GET_LENGTH(str);
4649
4650 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004651 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004652
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004653 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004654 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004655 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004656 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004657 if (v == NULL)
4658 return NULL;
4659
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004660 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004661 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004662 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004663
Antoine Pitrou244651a2009-05-04 18:56:13 +00004664 if (inShift) {
4665 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4666 /* shifting out */
4667 if (base64bits) { /* output remaining bits */
4668 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4669 base64buffer = 0;
4670 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004671 }
4672 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004673 /* Characters not in the BASE64 set implicitly unshift the sequence
4674 so no '-' is required, except if the character is itself a '-' */
4675 if (IS_BASE64(ch) || ch == '-') {
4676 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004677 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004678 *out++ = (char) ch;
4679 }
4680 else {
4681 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004682 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004683 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004684 else { /* not in a shift sequence */
4685 if (ch == '+') {
4686 *out++ = '+';
4687 *out++ = '-';
4688 }
4689 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4690 *out++ = (char) ch;
4691 }
4692 else {
4693 *out++ = '+';
4694 inShift = 1;
4695 goto encode_char;
4696 }
4697 }
4698 continue;
4699encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004700 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004701 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004702
Antoine Pitrou244651a2009-05-04 18:56:13 +00004703 /* code first surrogate */
4704 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004705 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004706 while (base64bits >= 6) {
4707 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4708 base64bits -= 6;
4709 }
4710 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004711 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004712 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004713 base64bits += 16;
4714 base64buffer = (base64buffer << 16) | ch;
4715 while (base64bits >= 6) {
4716 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4717 base64bits -= 6;
4718 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004719 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004720 if (base64bits)
4721 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4722 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004723 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004724 if (_PyBytes_Resize(&v, out - start) < 0)
4725 return NULL;
4726 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004727}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004728PyObject *
4729PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4730 Py_ssize_t size,
4731 int base64SetO,
4732 int base64WhiteSpace,
4733 const char *errors)
4734{
4735 PyObject *result;
4736 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4737 if (tmp == NULL)
4738 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004739 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004740 base64WhiteSpace, errors);
4741 Py_DECREF(tmp);
4742 return result;
4743}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004744
Antoine Pitrou244651a2009-05-04 18:56:13 +00004745#undef IS_BASE64
4746#undef FROM_BASE64
4747#undef TO_BASE64
4748#undef DECODE_DIRECT
4749#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004750
Guido van Rossumd57fd912000-03-10 22:53:23 +00004751/* --- UTF-8 Codec -------------------------------------------------------- */
4752
Alexander Belopolsky40018472011-02-26 01:02:56 +00004753PyObject *
4754PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004755 Py_ssize_t size,
4756 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004757{
Walter Dörwald69652032004-09-07 20:24:22 +00004758 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4759}
4760
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004761#include "stringlib/asciilib.h"
4762#include "stringlib/codecs.h"
4763#include "stringlib/undef.h"
4764
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004765#include "stringlib/ucs1lib.h"
4766#include "stringlib/codecs.h"
4767#include "stringlib/undef.h"
4768
4769#include "stringlib/ucs2lib.h"
4770#include "stringlib/codecs.h"
4771#include "stringlib/undef.h"
4772
4773#include "stringlib/ucs4lib.h"
4774#include "stringlib/codecs.h"
4775#include "stringlib/undef.h"
4776
Antoine Pitrouab868312009-01-10 15:40:25 +00004777/* Mask to quickly check whether a C 'long' contains a
4778 non-ASCII, UTF8-encoded char. */
4779#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004780# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004781#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004782# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004783#else
4784# error C 'long' size should be either 4 or 8!
4785#endif
4786
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004787static Py_ssize_t
4788ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004789{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004790 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004791 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004792
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004793 /*
4794 * Issue #17237: m68k is a bit different from most architectures in
4795 * that objects do not use "natural alignment" - for example, int and
4796 * long are only aligned at 2-byte boundaries. Therefore the assert()
4797 * won't work; also, tests have shown that skipping the "optimised
4798 * version" will even speed up m68k.
4799 */
4800#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004801#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004802 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4803 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004804 /* Fast path, see in STRINGLIB(utf8_decode) for
4805 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004806 /* Help allocation */
4807 const char *_p = p;
4808 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004809 while (_p < aligned_end) {
4810 unsigned long value = *(const unsigned long *) _p;
4811 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004812 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004813 *((unsigned long *)q) = value;
4814 _p += SIZEOF_LONG;
4815 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004816 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004817 p = _p;
4818 while (p < end) {
4819 if ((unsigned char)*p & 0x80)
4820 break;
4821 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004822 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004823 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004824 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004825#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004826#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004827 while (p < end) {
4828 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4829 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004830 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004831 /* Help allocation */
4832 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004833 while (_p < aligned_end) {
4834 unsigned long value = *(unsigned long *) _p;
4835 if (value & ASCII_CHAR_MASK)
4836 break;
4837 _p += SIZEOF_LONG;
4838 }
4839 p = _p;
4840 if (_p == end)
4841 break;
4842 }
4843 if ((unsigned char)*p & 0x80)
4844 break;
4845 ++p;
4846 }
4847 memcpy(dest, start, p - start);
4848 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004849}
Antoine Pitrouab868312009-01-10 15:40:25 +00004850
Victor Stinner785938e2011-12-11 20:09:03 +01004851PyObject *
4852PyUnicode_DecodeUTF8Stateful(const char *s,
4853 Py_ssize_t size,
4854 const char *errors,
4855 Py_ssize_t *consumed)
4856{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004857 _PyUnicodeWriter writer;
Victor Stinner785938e2011-12-11 20:09:03 +01004858 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004859 const char *end = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004860
4861 Py_ssize_t startinpos;
4862 Py_ssize_t endinpos;
4863 const char *errmsg = "";
Victor Stinner1d65d912015-10-05 13:43:50 +02004864 PyObject *error_handler_obj = NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004865 PyObject *exc = NULL;
Victor Stinner1d65d912015-10-05 13:43:50 +02004866 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner785938e2011-12-11 20:09:03 +01004867
4868 if (size == 0) {
4869 if (consumed)
4870 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004871 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004872 }
4873
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004874 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4875 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004876 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004877 *consumed = 1;
4878 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004879 }
4880
Victor Stinner8f674cc2013-04-17 23:02:17 +02004881 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02004882 writer.min_length = size;
4883 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004884 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004885
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004886 writer.pos = ascii_decode(s, end, writer.data);
4887 s += writer.pos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004888 while (s < end) {
4889 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004890 int kind = writer.kind;
Victor Stinner1d65d912015-10-05 13:43:50 +02004891
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004892 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004893 if (PyUnicode_IS_ASCII(writer.buffer))
4894 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004895 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004896 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004897 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004898 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004899 } else {
4900 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004901 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004902 }
4903
4904 switch (ch) {
4905 case 0:
4906 if (s == end || consumed)
4907 goto End;
4908 errmsg = "unexpected end of data";
4909 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004910 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004911 break;
4912 case 1:
4913 errmsg = "invalid start byte";
4914 startinpos = s - starts;
4915 endinpos = startinpos + 1;
4916 break;
4917 case 2:
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004918 case 3:
4919 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004920 errmsg = "invalid continuation byte";
4921 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004922 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004923 break;
4924 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004925 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004926 goto onError;
4927 continue;
4928 }
4929
Victor Stinner1d65d912015-10-05 13:43:50 +02004930 if (error_handler == _Py_ERROR_UNKNOWN)
4931 error_handler = get_error_handler(errors);
4932
4933 switch (error_handler) {
4934 case _Py_ERROR_IGNORE:
4935 s += (endinpos - startinpos);
4936 break;
4937
4938 case _Py_ERROR_REPLACE:
4939 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
4940 goto onError;
4941 s += (endinpos - startinpos);
4942 break;
4943
4944 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner74e8fac2015-10-05 13:49:26 +02004945 {
4946 Py_ssize_t i;
4947
Victor Stinner1d65d912015-10-05 13:43:50 +02004948 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
4949 goto onError;
Victor Stinner74e8fac2015-10-05 13:49:26 +02004950 for (i=startinpos; i<endinpos; i++) {
Victor Stinner1d65d912015-10-05 13:43:50 +02004951 ch = (Py_UCS4)(unsigned char)(starts[i]);
4952 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
4953 ch + 0xdc00);
4954 writer.pos++;
4955 }
4956 s += (endinpos - startinpos);
4957 break;
Victor Stinner74e8fac2015-10-05 13:49:26 +02004958 }
Victor Stinner1d65d912015-10-05 13:43:50 +02004959
4960 default:
4961 if (unicode_decode_call_errorhandler_writer(
4962 errors, &error_handler_obj,
4963 "utf-8", errmsg,
4964 &starts, &end, &startinpos, &endinpos, &exc, &s,
4965 &writer))
4966 goto onError;
4967 }
Victor Stinner785938e2011-12-11 20:09:03 +01004968 }
4969
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004970End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004971 if (consumed)
4972 *consumed = s - starts;
4973
Victor Stinner1d65d912015-10-05 13:43:50 +02004974 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004975 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004976 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004977
4978onError:
Victor Stinner1d65d912015-10-05 13:43:50 +02004979 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004980 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004981 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004982 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004983}
4984
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004985#ifdef __APPLE__
4986
4987/* Simplified UTF-8 decoder using surrogateescape error handler,
Victor Stinner0d92c4f2012-11-12 23:32:21 +01004988 used to decode the command line arguments on Mac OS X.
4989
4990 Return a pointer to a newly allocated wide character string (use
Victor Stinner6f8eeee2013-07-07 22:57:45 +02004991 PyMem_RawFree() to free the memory), or NULL on memory allocation error. */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004992
4993wchar_t*
4994_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
4995{
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004996 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004997 wchar_t *unicode;
4998 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004999
5000 /* Note: size will always be longer than the resulting Unicode
5001 character count */
Victor Stinnerf50e1872015-03-20 11:32:24 +01005002 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1))
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005003 return NULL;
Victor Stinner6f8eeee2013-07-07 22:57:45 +02005004 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005005 if (!unicode)
5006 return NULL;
5007
5008 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005009 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005010 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005011 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005012 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005013#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005014 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005015#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005016 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005017#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005018 if (ch > 0xFF) {
5019#if SIZEOF_WCHAR_T == 4
5020 assert(0);
5021#else
5022 assert(Py_UNICODE_IS_SURROGATE(ch));
5023 /* compute and append the two surrogates: */
5024 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5025 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5026#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005027 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005028 else {
5029 if (!ch && s == e)
5030 break;
5031 /* surrogateescape */
5032 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5033 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005034 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005035 unicode[outpos] = L'\0';
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005036 return unicode;
5037}
5038
5039#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00005040
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005041/* Primary internal function which creates utf8 encoded bytes objects.
5042
5043 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005044 and allocate exactly as much space needed at the end. Else allocate the
5045 maximum possible needed (4 result bytes per Unicode character), and return
5046 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005047*/
Tim Peters7e3d9612002-04-21 03:26:37 +00005048PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01005049_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005050{
Victor Stinner6099a032011-12-18 14:22:26 +01005051 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005052 void *data;
5053 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00005054
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005055 if (!PyUnicode_Check(unicode)) {
5056 PyErr_BadArgument();
5057 return NULL;
5058 }
5059
5060 if (PyUnicode_READY(unicode) == -1)
5061 return NULL;
5062
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005063 if (PyUnicode_UTF8(unicode))
5064 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5065 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005066
5067 kind = PyUnicode_KIND(unicode);
5068 data = PyUnicode_DATA(unicode);
5069 size = PyUnicode_GET_LENGTH(unicode);
5070
Benjamin Petersonead6b532011-12-20 17:23:42 -06005071 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005072 default:
5073 assert(0);
5074 case PyUnicode_1BYTE_KIND:
5075 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5076 assert(!PyUnicode_IS_ASCII(unicode));
5077 return ucs1lib_utf8_encoder(unicode, data, size, errors);
5078 case PyUnicode_2BYTE_KIND:
5079 return ucs2lib_utf8_encoder(unicode, data, size, errors);
5080 case PyUnicode_4BYTE_KIND:
5081 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00005082 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005083}
5084
Alexander Belopolsky40018472011-02-26 01:02:56 +00005085PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005086PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5087 Py_ssize_t size,
5088 const char *errors)
5089{
5090 PyObject *v, *unicode;
5091
5092 unicode = PyUnicode_FromUnicode(s, size);
5093 if (unicode == NULL)
5094 return NULL;
5095 v = _PyUnicode_AsUTF8String(unicode, errors);
5096 Py_DECREF(unicode);
5097 return v;
5098}
5099
5100PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005101PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005102{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005103 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005104}
5105
Walter Dörwald41980ca2007-08-16 21:55:45 +00005106/* --- UTF-32 Codec ------------------------------------------------------- */
5107
5108PyObject *
5109PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005110 Py_ssize_t size,
5111 const char *errors,
5112 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005113{
5114 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5115}
5116
5117PyObject *
5118PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005119 Py_ssize_t size,
5120 const char *errors,
5121 int *byteorder,
5122 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005123{
5124 const char *starts = s;
5125 Py_ssize_t startinpos;
5126 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005127 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005128 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01005129 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005130 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005131 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005132 PyObject *errorHandler = NULL;
5133 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005134
Walter Dörwald41980ca2007-08-16 21:55:45 +00005135 q = (unsigned char *)s;
5136 e = q + size;
5137
5138 if (byteorder)
5139 bo = *byteorder;
5140
5141 /* Check for BOM marks (U+FEFF) in the input and adjust current
5142 byte order setting accordingly. In native mode, the leading BOM
5143 mark is skipped, in all other modes, it is copied to the output
5144 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01005145 if (bo == 0 && size >= 4) {
5146 Py_UCS4 bom = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5147 if (bom == 0x0000FEFF) {
5148 bo = -1;
5149 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005150 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005151 else if (bom == 0xFFFE0000) {
5152 bo = 1;
5153 q += 4;
5154 }
5155 if (byteorder)
5156 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005157 }
5158
Victor Stinnere64322e2012-10-30 23:12:47 +01005159 if (q == e) {
5160 if (consumed)
5161 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005162 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00005163 }
5164
Victor Stinnere64322e2012-10-30 23:12:47 +01005165#ifdef WORDS_BIGENDIAN
5166 le = bo < 0;
5167#else
5168 le = bo <= 0;
5169#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005170 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01005171
Victor Stinner8f674cc2013-04-17 23:02:17 +02005172 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005173 writer.min_length = (e - q + 3) / 4;
5174 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005175 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005176
Victor Stinnere64322e2012-10-30 23:12:47 +01005177 while (1) {
5178 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005179 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005180
Victor Stinnere64322e2012-10-30 23:12:47 +01005181 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005182 enum PyUnicode_Kind kind = writer.kind;
5183 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005184 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005185 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005186 if (le) {
5187 do {
5188 ch = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5189 if (ch > maxch)
5190 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005191 if (kind != PyUnicode_1BYTE_KIND &&
5192 Py_UNICODE_IS_SURROGATE(ch))
5193 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005194 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005195 q += 4;
5196 } while (q <= last);
5197 }
5198 else {
5199 do {
5200 ch = (q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
5201 if (ch > maxch)
5202 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005203 if (kind != PyUnicode_1BYTE_KIND &&
5204 Py_UNICODE_IS_SURROGATE(ch))
5205 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005206 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005207 q += 4;
5208 } while (q <= last);
5209 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005210 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005211 }
5212
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005213 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005214 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005215 startinpos = ((const char *)q) - starts;
5216 endinpos = startinpos + 4;
5217 }
5218 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005219 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005220 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005221 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005222 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005223 startinpos = ((const char *)q) - starts;
5224 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005225 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005226 else {
5227 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005228 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005229 goto onError;
5230 q += 4;
5231 continue;
5232 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005233 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005234 startinpos = ((const char *)q) - starts;
5235 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005236 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005237
5238 /* The remaining input chars are ignored if the callback
5239 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005240 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005241 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005242 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005243 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005244 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005245 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005246 }
5247
Walter Dörwald41980ca2007-08-16 21:55:45 +00005248 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005249 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005250
Walter Dörwald41980ca2007-08-16 21:55:45 +00005251 Py_XDECREF(errorHandler);
5252 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005253 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005254
Benjamin Peterson29060642009-01-31 22:14:21 +00005255 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005256 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005257 Py_XDECREF(errorHandler);
5258 Py_XDECREF(exc);
5259 return NULL;
5260}
5261
5262PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005263_PyUnicode_EncodeUTF32(PyObject *str,
5264 const char *errors,
5265 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005266{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005267 enum PyUnicode_Kind kind;
5268 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005269 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005270 PyObject *v;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005271 PY_UINT32_T *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005272#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005273 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005274#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005275 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005276#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005277 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005278 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005279 PyObject *errorHandler = NULL;
5280 PyObject *exc = NULL;
5281 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005282
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005283 if (!PyUnicode_Check(str)) {
5284 PyErr_BadArgument();
5285 return NULL;
5286 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005287 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005288 return NULL;
5289 kind = PyUnicode_KIND(str);
5290 data = PyUnicode_DATA(str);
5291 len = PyUnicode_GET_LENGTH(str);
5292
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005293 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005294 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005295 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005296 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005297 if (v == NULL)
5298 return NULL;
5299
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005300 /* output buffer is 4-bytes aligned */
5301 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
5302 out = (PY_UINT32_T *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005303 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005304 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005305 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005306 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005307
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005308 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005309 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005310 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005311 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005312 else
5313 encoding = "utf-32";
5314
5315 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005316 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5317 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005318 }
5319
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005320 pos = 0;
5321 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005322 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005323
5324 if (kind == PyUnicode_2BYTE_KIND) {
5325 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5326 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005327 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005328 else {
5329 assert(kind == PyUnicode_4BYTE_KIND);
5330 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5331 &out, native_ordering);
5332 }
5333 if (pos == len)
5334 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005335
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005336 rep = unicode_encode_call_errorhandler(
5337 errors, &errorHandler,
5338 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005339 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005340 if (!rep)
5341 goto error;
5342
5343 if (PyBytes_Check(rep)) {
5344 repsize = PyBytes_GET_SIZE(rep);
5345 if (repsize & 3) {
5346 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005347 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005348 "surrogates not allowed");
5349 goto error;
5350 }
5351 moreunits = repsize / 4;
5352 }
5353 else {
5354 assert(PyUnicode_Check(rep));
5355 if (PyUnicode_READY(rep) < 0)
5356 goto error;
5357 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5358 if (!PyUnicode_IS_ASCII(rep)) {
5359 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005360 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005361 "surrogates not allowed");
5362 goto error;
5363 }
5364 }
5365
5366 /* four bytes are reserved for each surrogate */
5367 if (moreunits > 1) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005368 Py_ssize_t outpos = out - (PY_UINT32_T*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005369 Py_ssize_t morebytes = 4 * (moreunits - 1);
5370 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5371 /* integer overflow */
5372 PyErr_NoMemory();
5373 goto error;
5374 }
5375 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5376 goto error;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005377 out = (PY_UINT32_T*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005378 }
5379
5380 if (PyBytes_Check(rep)) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005381 Py_MEMCPY(out, PyBytes_AS_STRING(rep), repsize);
5382 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005383 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005384 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005385 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5386 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005387 }
5388
5389 Py_CLEAR(rep);
5390 }
5391
5392 /* Cut back to size actually needed. This is necessary for, for example,
5393 encoding of a string containing isolated surrogates and the 'ignore'
5394 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005395 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005396 if (nsize != PyBytes_GET_SIZE(v))
5397 _PyBytes_Resize(&v, nsize);
5398 Py_XDECREF(errorHandler);
5399 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005400 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005401 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005402 error:
5403 Py_XDECREF(rep);
5404 Py_XDECREF(errorHandler);
5405 Py_XDECREF(exc);
5406 Py_XDECREF(v);
5407 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005408}
5409
Alexander Belopolsky40018472011-02-26 01:02:56 +00005410PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005411PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5412 Py_ssize_t size,
5413 const char *errors,
5414 int byteorder)
5415{
5416 PyObject *result;
5417 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5418 if (tmp == NULL)
5419 return NULL;
5420 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5421 Py_DECREF(tmp);
5422 return result;
5423}
5424
5425PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005426PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005427{
Victor Stinnerb960b342011-11-20 19:12:52 +01005428 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005429}
5430
Guido van Rossumd57fd912000-03-10 22:53:23 +00005431/* --- UTF-16 Codec ------------------------------------------------------- */
5432
Tim Peters772747b2001-08-09 22:21:55 +00005433PyObject *
5434PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005435 Py_ssize_t size,
5436 const char *errors,
5437 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005438{
Walter Dörwald69652032004-09-07 20:24:22 +00005439 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5440}
5441
5442PyObject *
5443PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005444 Py_ssize_t size,
5445 const char *errors,
5446 int *byteorder,
5447 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005448{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005449 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005450 Py_ssize_t startinpos;
5451 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005452 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005453 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005454 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005455 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005456 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005457 PyObject *errorHandler = NULL;
5458 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005459 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005460
Tim Peters772747b2001-08-09 22:21:55 +00005461 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005462 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005463
5464 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005465 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005466
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005467 /* Check for BOM marks (U+FEFF) in the input and adjust current
5468 byte order setting accordingly. In native mode, the leading BOM
5469 mark is skipped, in all other modes, it is copied to the output
5470 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005471 if (bo == 0 && size >= 2) {
5472 const Py_UCS4 bom = (q[1] << 8) | q[0];
5473 if (bom == 0xFEFF) {
5474 q += 2;
5475 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005476 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005477 else if (bom == 0xFFFE) {
5478 q += 2;
5479 bo = 1;
5480 }
5481 if (byteorder)
5482 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005483 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005484
Antoine Pitrou63065d72012-05-15 23:48:04 +02005485 if (q == e) {
5486 if (consumed)
5487 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005488 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005489 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005490
Christian Heimes743e0cd2012-10-17 23:52:17 +02005491#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005492 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005493 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005494#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005495 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005496 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005497#endif
Tim Peters772747b2001-08-09 22:21:55 +00005498
Antoine Pitrou63065d72012-05-15 23:48:04 +02005499 /* Note: size will always be longer than the resulting Unicode
5500 character count */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005501 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005502 writer.min_length = (e - q + 1) / 2;
5503 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005504 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005505
Antoine Pitrou63065d72012-05-15 23:48:04 +02005506 while (1) {
5507 Py_UCS4 ch = 0;
5508 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005509 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005510 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005511 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005512 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005513 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005514 native_ordering);
5515 else
5516 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005517 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005518 native_ordering);
5519 } else if (kind == PyUnicode_2BYTE_KIND) {
5520 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005521 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005522 native_ordering);
5523 } else {
5524 assert(kind == PyUnicode_4BYTE_KIND);
5525 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005526 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005527 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005528 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005529 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005530
Antoine Pitrou63065d72012-05-15 23:48:04 +02005531 switch (ch)
5532 {
5533 case 0:
5534 /* remaining byte at the end? (size should be even) */
5535 if (q == e || consumed)
5536 goto End;
5537 errmsg = "truncated data";
5538 startinpos = ((const char *)q) - starts;
5539 endinpos = ((const char *)e) - starts;
5540 break;
5541 /* The remaining input chars are ignored if the callback
5542 chooses to skip the input */
5543 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005544 q -= 2;
5545 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005546 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005547 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005548 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005549 endinpos = ((const char *)e) - starts;
5550 break;
5551 case 2:
5552 errmsg = "illegal encoding";
5553 startinpos = ((const char *)q) - 2 - starts;
5554 endinpos = startinpos + 2;
5555 break;
5556 case 3:
5557 errmsg = "illegal UTF-16 surrogate";
5558 startinpos = ((const char *)q) - 4 - starts;
5559 endinpos = startinpos + 2;
5560 break;
5561 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005562 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005563 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005564 continue;
5565 }
5566
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005567 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005568 errors,
5569 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005570 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00005571 &starts,
5572 (const char **)&e,
5573 &startinpos,
5574 &endinpos,
5575 &exc,
5576 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005577 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005578 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005579 }
5580
Antoine Pitrou63065d72012-05-15 23:48:04 +02005581End:
Walter Dörwald69652032004-09-07 20:24:22 +00005582 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005583 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005584
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005585 Py_XDECREF(errorHandler);
5586 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005587 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005588
Benjamin Peterson29060642009-01-31 22:14:21 +00005589 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005590 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005591 Py_XDECREF(errorHandler);
5592 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005593 return NULL;
5594}
5595
Tim Peters772747b2001-08-09 22:21:55 +00005596PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005597_PyUnicode_EncodeUTF16(PyObject *str,
5598 const char *errors,
5599 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005600{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005601 enum PyUnicode_Kind kind;
5602 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005603 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005604 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005605 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005606 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005607#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005608 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005609#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005610 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005611#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005612 const char *encoding;
5613 Py_ssize_t nsize, pos;
5614 PyObject *errorHandler = NULL;
5615 PyObject *exc = NULL;
5616 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00005617
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005618 if (!PyUnicode_Check(str)) {
5619 PyErr_BadArgument();
5620 return NULL;
5621 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005622 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005623 return NULL;
5624 kind = PyUnicode_KIND(str);
5625 data = PyUnicode_DATA(str);
5626 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005627
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005628 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005629 if (kind == PyUnicode_4BYTE_KIND) {
5630 const Py_UCS4 *in = (const Py_UCS4 *)data;
5631 const Py_UCS4 *end = in + len;
5632 while (in < end)
5633 if (*in++ >= 0x10000)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005634 pairs++;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005635 }
5636 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005637 return PyErr_NoMemory();
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005638 nsize = len + pairs + (byteorder == 0);
5639 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005640 if (v == NULL)
5641 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005642
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005643 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005644 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005645 out = (unsigned short *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005646 if (byteorder == 0)
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005647 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005648 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005649 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005650
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005651 if (kind == PyUnicode_1BYTE_KIND) {
5652 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5653 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005654 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005655
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005656 if (byteorder < 0)
5657 encoding = "utf-16-le";
5658 else if (byteorder > 0)
5659 encoding = "utf-16-be";
5660 else
5661 encoding = "utf-16";
5662
5663 pos = 0;
5664 while (pos < len) {
5665 Py_ssize_t repsize, moreunits;
5666
5667 if (kind == PyUnicode_2BYTE_KIND) {
5668 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
5669 &out, native_ordering);
5670 }
5671 else {
5672 assert(kind == PyUnicode_4BYTE_KIND);
5673 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
5674 &out, native_ordering);
5675 }
5676 if (pos == len)
5677 break;
5678
5679 rep = unicode_encode_call_errorhandler(
5680 errors, &errorHandler,
5681 encoding, "surrogates not allowed",
5682 str, &exc, pos, pos + 1, &pos);
5683 if (!rep)
5684 goto error;
5685
5686 if (PyBytes_Check(rep)) {
5687 repsize = PyBytes_GET_SIZE(rep);
5688 if (repsize & 1) {
5689 raise_encode_exception(&exc, encoding,
5690 str, pos - 1, pos,
5691 "surrogates not allowed");
5692 goto error;
5693 }
5694 moreunits = repsize / 2;
5695 }
5696 else {
5697 assert(PyUnicode_Check(rep));
5698 if (PyUnicode_READY(rep) < 0)
5699 goto error;
5700 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5701 if (!PyUnicode_IS_ASCII(rep)) {
5702 raise_encode_exception(&exc, encoding,
5703 str, pos - 1, pos,
5704 "surrogates not allowed");
5705 goto error;
5706 }
5707 }
5708
5709 /* two bytes are reserved for each surrogate */
5710 if (moreunits > 1) {
5711 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
5712 Py_ssize_t morebytes = 2 * (moreunits - 1);
5713 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5714 /* integer overflow */
5715 PyErr_NoMemory();
5716 goto error;
5717 }
5718 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5719 goto error;
5720 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
5721 }
5722
5723 if (PyBytes_Check(rep)) {
5724 Py_MEMCPY(out, PyBytes_AS_STRING(rep), repsize);
5725 out += moreunits;
5726 } else /* rep is unicode */ {
5727 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5728 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5729 &out, native_ordering);
5730 }
5731
5732 Py_CLEAR(rep);
5733 }
5734
5735 /* Cut back to size actually needed. This is necessary for, for example,
5736 encoding of a string containing isolated surrogates and the 'ignore' handler
5737 is used. */
5738 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5739 if (nsize != PyBytes_GET_SIZE(v))
5740 _PyBytes_Resize(&v, nsize);
5741 Py_XDECREF(errorHandler);
5742 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00005743 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005744 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005745 error:
5746 Py_XDECREF(rep);
5747 Py_XDECREF(errorHandler);
5748 Py_XDECREF(exc);
5749 Py_XDECREF(v);
5750 return NULL;
5751#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005752}
5753
Alexander Belopolsky40018472011-02-26 01:02:56 +00005754PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005755PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5756 Py_ssize_t size,
5757 const char *errors,
5758 int byteorder)
5759{
5760 PyObject *result;
5761 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5762 if (tmp == NULL)
5763 return NULL;
5764 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5765 Py_DECREF(tmp);
5766 return result;
5767}
5768
5769PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005770PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005771{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005772 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005773}
5774
5775/* --- Unicode Escape Codec ----------------------------------------------- */
5776
/* Helper function for PyUnicode_DecodeUnicodeEscape: determines whether
   the escaped string still decodes to pure ASCII.

   Returns the number of characters the decoded string will have, or -1
   when that cannot be guaranteed:
     - size is negative,
     - a byte above 127 is present,
     - a backslash is the last byte,
     - an octal, \x, \u, \U or \N escape is present (these may produce
       non-ASCII characters).
   A backslash followed by an unrecognized character counts as two
   characters; a backslash followed by a newline counts as none. */
static Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *p;
    const unsigned char *end;
    Py_ssize_t length = 0;

    /* Reject a negative size before forming `s + size`: pointer
       arithmetic with a negative offset would be undefined. */
    if (size < 0)
        return -1;

    p = (const unsigned char *)s;
    end = p + size;

    for (; p < end; ++p) {
        if (*p > 127) {
            /* Non-ASCII */
            return -1;
        }
        else if (*p != '\\') {
            /* Normal character */
            ++length;
        }
        else {
            /* Backslash-escape, check next char */
            ++p;
            /* Escape sequence reaches till end of string or
               non-ASCII follow-up. */
            if (p >= end || *p > 127)
                return -1;
            switch (*p) {
            case '\n':
                /* backslash + \n result in zero characters */
                break;
            case '\\': case '\'': case '\"':
            case 'b': case 'f': case 't':
            case 'n': case 'r': case 'v': case 'a':
                ++length;
                break;
            case '0': case '1': case '2': case '3':
            case '4': case '5': case '6': case '7':
            case 'x': case 'u': case 'U': case 'N':
                /* these do not guarantee ASCII characters */
                return -1;
            default:
                /* count the backslash + the other character */
                length += 2;
            }
        }
    }
    return length;
}
5831
Fredrik Lundh06d12682001-01-24 07:59:11 +00005832static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005833
Alexander Belopolsky40018472011-02-26 01:02:56 +00005834PyObject *
5835PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005836 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005837 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005838{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005839 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005840 Py_ssize_t startinpos;
5841 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005842 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005843 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005844 char* message;
5845 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005846 PyObject *errorHandler = NULL;
5847 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005848 Py_ssize_t len;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005849
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005850 len = length_of_escaped_ascii_string(s, size);
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005851 if (len == 0)
5852 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005853
5854 /* After length_of_escaped_ascii_string() there are two alternatives,
5855 either the string is pure ASCII with named escapes like \n, etc.
5856 and we determined it's exact size (common case)
5857 or it contains \x, \u, ... escape sequences. then we create a
5858 legacy wchar string and resize it at the end of this function. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005859 _PyUnicodeWriter_Init(&writer);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005860 if (len > 0) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02005861 writer.min_length = len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005862 }
5863 else {
5864 /* Escaped strings will always be longer than the resulting
5865 Unicode string, so we start with size here and then reduce the
5866 length after conversion to the true value.
5867 (but if the error callback returns a long replacement string
5868 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005869 writer.min_length = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005870 }
5871
Guido van Rossumd57fd912000-03-10 22:53:23 +00005872 if (size == 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005873 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005874 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005875
Guido van Rossumd57fd912000-03-10 22:53:23 +00005876 while (s < end) {
5877 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005878 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005879 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005880
5881 /* Non-escape characters are interpreted as Unicode ordinals */
5882 if (*s != '\\') {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005883 x = (unsigned char)*s;
5884 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005885 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005886 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005887 continue;
5888 }
5889
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005890 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005891 /* \ - Escapes */
5892 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005893 c = *s++;
5894 if (s > end)
5895 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005896
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005897 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005898
Benjamin Peterson29060642009-01-31 22:14:21 +00005899 /* \x escapes */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005900#define WRITECHAR(ch) \
5901 do { \
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005902 if (_PyUnicodeWriter_WriteCharInline(&writer, (ch)) < 0) \
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005903 goto onError; \
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005904 } while(0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005905
Guido van Rossumd57fd912000-03-10 22:53:23 +00005906 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005907 case '\\': WRITECHAR('\\'); break;
5908 case '\'': WRITECHAR('\''); break;
5909 case '\"': WRITECHAR('\"'); break;
5910 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005911 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005912 case 'f': WRITECHAR('\014'); break;
5913 case 't': WRITECHAR('\t'); break;
5914 case 'n': WRITECHAR('\n'); break;
5915 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005916 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005917 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005918 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005919 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005920
Benjamin Peterson29060642009-01-31 22:14:21 +00005921 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005922 case '0': case '1': case '2': case '3':
5923 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005924 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005925 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005926 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005927 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005928 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005929 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005930 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005931 break;
5932
Benjamin Peterson29060642009-01-31 22:14:21 +00005933 /* hex escapes */
5934 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005935 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005936 digits = 2;
5937 message = "truncated \\xXX escape";
5938 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005939
Benjamin Peterson29060642009-01-31 22:14:21 +00005940 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005941 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005942 digits = 4;
5943 message = "truncated \\uXXXX escape";
5944 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005945
Benjamin Peterson29060642009-01-31 22:14:21 +00005946 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005947 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005948 digits = 8;
5949 message = "truncated \\UXXXXXXXX escape";
5950 hexescape:
5951 chr = 0;
Serhiy Storchakad6793772013-01-29 10:20:44 +02005952 if (end - s < digits) {
5953 /* count only hex digits */
5954 for (; s < end; ++s) {
5955 c = (unsigned char)*s;
5956 if (!Py_ISXDIGIT(c))
5957 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005958 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005959 goto error;
5960 }
5961 for (; digits--; ++s) {
5962 c = (unsigned char)*s;
5963 if (!Py_ISXDIGIT(c))
5964 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005965 chr = (chr<<4) & ~0xF;
5966 if (c >= '0' && c <= '9')
5967 chr += c - '0';
5968 else if (c >= 'a' && c <= 'f')
5969 chr += 10 + c - 'a';
5970 else
5971 chr += 10 + c - 'A';
5972 }
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005973 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005974 /* _decoding_error will have already written into the
5975 target buffer. */
5976 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005977 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005978 /* when we get here, chr is a 32-bit unicode character */
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005979 message = "illegal Unicode character";
5980 if (chr > MAX_UNICODE)
Serhiy Storchakad6793772013-01-29 10:20:44 +02005981 goto error;
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005982 WRITECHAR(chr);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005983 break;
5984
Benjamin Peterson29060642009-01-31 22:14:21 +00005985 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005986 case 'N':
5987 message = "malformed \\N character escape";
5988 if (ucnhash_CAPI == NULL) {
5989 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005990 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5991 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005992 if (ucnhash_CAPI == NULL)
5993 goto ucnhashError;
5994 }
5995 if (*s == '{') {
5996 const char *start = s+1;
5997 /* look for the closing brace */
5998 while (*s != '}' && s < end)
5999 s++;
6000 if (s > start && s < end && *s == '}') {
6001 /* found a name. look it up in the unicode database */
6002 message = "unknown Unicode character name";
6003 s++;
Serhiy Storchaka4f5f0e52013-01-21 11:38:00 +02006004 if (s - start - 1 <= INT_MAX &&
Serhiy Storchakac35f3a92013-01-21 11:42:57 +02006005 ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03006006 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00006007 goto store;
6008 }
6009 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006010 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006011
6012 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00006013 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006014 message = "\\ at end of string";
6015 s--;
Serhiy Storchakad6793772013-01-29 10:20:44 +02006016 goto error;
Walter Dörwald8c077222002-03-25 11:16:18 +00006017 }
6018 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006019 WRITECHAR('\\');
Serhiy Storchaka73e38802013-01-25 23:52:21 +02006020 WRITECHAR((unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00006021 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006022 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006023 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006024 continue;
6025
6026 error:
6027 endinpos = s-starts;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006028 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02006029 errors, &errorHandler,
6030 "unicodeescape", message,
6031 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006032 &writer))
Serhiy Storchakad6793772013-01-29 10:20:44 +02006033 goto onError;
6034 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006035 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006036#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006037
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006038 Py_XDECREF(errorHandler);
6039 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006040 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00006041
Benjamin Peterson29060642009-01-31 22:14:21 +00006042 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00006043 PyErr_SetString(
6044 PyExc_UnicodeError,
6045 "\\N escapes not supported (can't load unicodedata module)"
6046 );
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006047 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006048 Py_XDECREF(errorHandler);
6049 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00006050 return NULL;
6051
Benjamin Peterson29060642009-01-31 22:14:21 +00006052 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006053 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006054 Py_XDECREF(errorHandler);
6055 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006056 return NULL;
6057}
6058
6059/* Return a Unicode-Escape string version of the Unicode object.
6060
6061 If quotes is true, the string is enclosed in u"" or u'' quotes as
6062 appropriate.
6063
6064*/
6065
Alexander Belopolsky40018472011-02-26 01:02:56 +00006066PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006067PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006068{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006069 Py_ssize_t i, len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006070 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006071 int kind;
6072 void *data;
Victor Stinner358af132015-10-12 22:36:57 +02006073 _PyBytesWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006074
Ezio Melottie7f90372012-10-05 03:33:31 +03006075 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00006076 escape.
6077
Ezio Melottie7f90372012-10-05 03:33:31 +03006078 For UCS1 strings it's '\xxx', 4 bytes per source character.
6079 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6080 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00006081 */
6082
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006083 if (!PyUnicode_Check(unicode)) {
6084 PyErr_BadArgument();
6085 return NULL;
6086 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006087 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006088 return NULL;
Victor Stinner358af132015-10-12 22:36:57 +02006089
6090 _PyBytesWriter_Init(&writer);
6091
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006092 len = PyUnicode_GET_LENGTH(unicode);
6093 kind = PyUnicode_KIND(unicode);
6094 data = PyUnicode_DATA(unicode);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006095
Victor Stinner358af132015-10-12 22:36:57 +02006096 p = _PyBytesWriter_Alloc(&writer, len);
6097 if (p == NULL)
6098 goto error;
6099 writer.overallocate = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006100
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006101 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006102 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006103
Walter Dörwald79e913e2007-05-12 11:08:06 +00006104 /* Escape backslashes */
6105 if (ch == '\\') {
Victor Stinner358af132015-10-12 22:36:57 +02006106 /* -1: substract 1 preallocated byte */
6107 p = _PyBytesWriter_Prepare(&writer, p, 2-1);
6108 if (p == NULL)
6109 goto error;
6110
Guido van Rossumd57fd912000-03-10 22:53:23 +00006111 *p++ = '\\';
6112 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00006113 continue;
Tim Petersced69f82003-09-16 20:30:58 +00006114 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006115
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006116 /* Map 21-bit characters to '\U00xxxxxx' */
6117 else if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006118 assert(ch <= MAX_UNICODE);
Victor Stinner358af132015-10-12 22:36:57 +02006119
6120 p = _PyBytesWriter_Prepare(&writer, p, 10-1);
6121 if (p == NULL)
6122 goto error;
6123
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006124 *p++ = '\\';
6125 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006126 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
6127 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
6128 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6129 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6130 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6131 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6132 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6133 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00006134 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006135 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00006136
Guido van Rossumd57fd912000-03-10 22:53:23 +00006137 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00006138 if (ch >= 256) {
Victor Stinner358af132015-10-12 22:36:57 +02006139 p = _PyBytesWriter_Prepare(&writer, p, 6-1);
6140 if (p == NULL)
6141 goto error;
6142
Guido van Rossumd57fd912000-03-10 22:53:23 +00006143 *p++ = '\\';
6144 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006145 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6146 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6147 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6148 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006149 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006150
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006151 /* Map special whitespace to '\t', \n', '\r' */
6152 else if (ch == '\t') {
Victor Stinner358af132015-10-12 22:36:57 +02006153 p = _PyBytesWriter_Prepare(&writer, p, 2-1);
6154 if (p == NULL)
6155 goto error;
6156
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006157 *p++ = '\\';
6158 *p++ = 't';
6159 }
6160 else if (ch == '\n') {
Victor Stinner358af132015-10-12 22:36:57 +02006161 p = _PyBytesWriter_Prepare(&writer, p, 2-1);
6162 if (p == NULL)
6163 goto error;
6164
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006165 *p++ = '\\';
6166 *p++ = 'n';
6167 }
6168 else if (ch == '\r') {
Victor Stinner358af132015-10-12 22:36:57 +02006169 p = _PyBytesWriter_Prepare(&writer, p, 2-1);
6170 if (p == NULL)
6171 goto error;
6172
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006173 *p++ = '\\';
6174 *p++ = 'r';
6175 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006176
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006177 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00006178 else if (ch < ' ' || ch >= 0x7F) {
Victor Stinner358af132015-10-12 22:36:57 +02006179 /* -1: substract 1 preallocated byte */
6180 p = _PyBytesWriter_Prepare(&writer, p, 4-1);
6181 if (p == NULL)
6182 goto error;
6183
Guido van Rossumd57fd912000-03-10 22:53:23 +00006184 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006185 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006186 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6187 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00006188 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006189
Guido van Rossumd57fd912000-03-10 22:53:23 +00006190 /* Copy everything else as-is */
6191 else
6192 *p++ = (char) ch;
6193 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006194
Victor Stinner358af132015-10-12 22:36:57 +02006195 return _PyBytesWriter_Finish(&writer, p);
6196
6197error:
6198 _PyBytesWriter_Dealloc(&writer);
6199 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006200}
6201
Alexander Belopolsky40018472011-02-26 01:02:56 +00006202PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006203PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6204 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006205{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006206 PyObject *result;
6207 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6208 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006209 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006210 result = PyUnicode_AsUnicodeEscapeString(tmp);
6211 Py_DECREF(tmp);
6212 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006213}
6214
6215/* --- Raw Unicode Escape Codec ------------------------------------------- */
6216
Alexander Belopolsky40018472011-02-26 01:02:56 +00006217PyObject *
6218PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006219 Py_ssize_t size,
6220 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006221{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006222 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006223 Py_ssize_t startinpos;
6224 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006225 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006226 const char *end;
6227 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006228 PyObject *errorHandler = NULL;
6229 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006230
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006231 if (size == 0)
6232 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006233
Guido van Rossumd57fd912000-03-10 22:53:23 +00006234 /* Escaped strings will always be longer than the resulting
6235 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006236 length after conversion to the true value. (But decoding error
6237 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006238 _PyUnicodeWriter_Init(&writer);
6239 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006240
Guido van Rossumd57fd912000-03-10 22:53:23 +00006241 end = s + size;
6242 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006243 unsigned char c;
6244 Py_UCS4 x;
6245 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006246 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006247
Benjamin Peterson29060642009-01-31 22:14:21 +00006248 /* Non-escape characters are interpreted as Unicode ordinals */
6249 if (*s != '\\') {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006250 x = (unsigned char)*s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006251 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006252 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006253 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006254 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006255 startinpos = s-starts;
6256
6257 /* \u-escapes are only interpreted iff the number of leading
6258 backslashes if odd */
6259 bs = s;
6260 for (;s < end;) {
6261 if (*s != '\\')
6262 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006263 x = (unsigned char)*s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006264 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006265 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006266 }
6267 if (((s - bs) & 1) == 0 ||
6268 s >= end ||
6269 (*s != 'u' && *s != 'U')) {
6270 continue;
6271 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006272 writer.pos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00006273 count = *s=='u' ? 4 : 8;
6274 s++;
6275
6276 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00006277 for (x = 0, i = 0; i < count; ++i, ++s) {
6278 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00006279 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006280 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006281 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00006282 errors, &errorHandler,
6283 "rawunicodeescape", "truncated \\uXXXX",
6284 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006285 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006286 goto onError;
6287 goto nextByte;
6288 }
6289 x = (x<<4) & ~0xF;
6290 if (c >= '0' && c <= '9')
6291 x += c - '0';
6292 else if (c >= 'a' && c <= 'f')
6293 x += 10 + c - 'a';
6294 else
6295 x += 10 + c - 'A';
6296 }
Victor Stinner8faf8212011-12-08 22:14:11 +01006297 if (x <= MAX_UNICODE) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006298 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006299 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006300 }
6301 else {
Christian Heimesfe337bf2008-03-23 21:54:12 +00006302 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006303 if (unicode_decode_call_errorhandler_writer(
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006304 errors, &errorHandler,
6305 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006306 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006307 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006308 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006309 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006310 nextByte:
6311 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006312 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006313 Py_XDECREF(errorHandler);
6314 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006315 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006316
Benjamin Peterson29060642009-01-31 22:14:21 +00006317 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006318 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006319 Py_XDECREF(errorHandler);
6320 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006321 return NULL;
6322}
6323
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006324
Alexander Belopolsky40018472011-02-26 01:02:56 +00006325PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006326PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006327{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006328 char *p;
Victor Stinner358af132015-10-12 22:36:57 +02006329 Py_ssize_t pos;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006330 int kind;
6331 void *data;
6332 Py_ssize_t len;
Victor Stinner358af132015-10-12 22:36:57 +02006333 _PyBytesWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006334
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006335 if (!PyUnicode_Check(unicode)) {
6336 PyErr_BadArgument();
6337 return NULL;
6338 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006339 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006340 return NULL;
Victor Stinner358af132015-10-12 22:36:57 +02006341
6342 _PyBytesWriter_Init(&writer);
6343
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006344 kind = PyUnicode_KIND(unicode);
6345 data = PyUnicode_DATA(unicode);
6346 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner0e368262011-11-10 20:12:49 +01006347
Victor Stinner358af132015-10-12 22:36:57 +02006348 p = _PyBytesWriter_Alloc(&writer, len);
6349 if (p == NULL)
6350 goto error;
6351 writer.overallocate = 1;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006352
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006353 for (pos = 0; pos < len; pos++) {
6354 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006355 /* Map 32-bit characters to '\Uxxxxxxxx' */
6356 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006357 assert(ch <= MAX_UNICODE);
Victor Stinner358af132015-10-12 22:36:57 +02006358
6359 /* -1: substract 1 preallocated byte */
6360 p = _PyBytesWriter_Prepare(&writer, p, 10-1);
6361 if (p == NULL)
6362 goto error;
6363
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006364 *p++ = '\\';
6365 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006366 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6367 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6368 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6369 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6370 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6371 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6372 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6373 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006374 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006375 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006376 else if (ch >= 256) {
Victor Stinner358af132015-10-12 22:36:57 +02006377 /* -1: substract 1 preallocated byte */
6378 p = _PyBytesWriter_Prepare(&writer, p, 6-1);
6379 if (p == NULL)
6380 goto error;
6381
Guido van Rossumd57fd912000-03-10 22:53:23 +00006382 *p++ = '\\';
6383 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006384 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6385 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6386 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6387 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006388 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006389 /* Copy everything else as-is */
6390 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006391 *p++ = (char) ch;
6392 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006393
Victor Stinner358af132015-10-12 22:36:57 +02006394 return _PyBytesWriter_Finish(&writer, p);
6395
6396error:
6397 _PyBytesWriter_Dealloc(&writer);
6398 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006399}
6400
Alexander Belopolsky40018472011-02-26 01:02:56 +00006401PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006402PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6403 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006404{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006405 PyObject *result;
6406 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6407 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006408 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006409 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6410 Py_DECREF(tmp);
6411 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006412}
6413
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006414/* --- Unicode Internal Codec ------------------------------------------- */
6415
Alexander Belopolsky40018472011-02-26 01:02:56 +00006416PyObject *
6417_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006418 Py_ssize_t size,
6419 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006420{
6421 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006422 Py_ssize_t startinpos;
6423 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006424 _PyUnicodeWriter writer;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006425 const char *end;
6426 const char *reason;
6427 PyObject *errorHandler = NULL;
6428 PyObject *exc = NULL;
6429
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006430 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006431 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006432 1))
6433 return NULL;
6434
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006435 if (size == 0)
6436 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006437
Victor Stinner8f674cc2013-04-17 23:02:17 +02006438 _PyUnicodeWriter_Init(&writer);
6439 if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) {
6440 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00006441 goto onError;
Victor Stinner8f674cc2013-04-17 23:02:17 +02006442 }
6443 writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006444
Victor Stinner8f674cc2013-04-17 23:02:17 +02006445 end = s + size;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006446 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006447 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006448 Py_UCS4 ch;
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006449 if (end - s < Py_UNICODE_SIZE) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006450 endinpos = end-starts;
6451 reason = "truncated input";
6452 goto error;
6453 }
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006454 /* We copy the raw representation one byte at a time because the
6455 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006456 ((char *) &uch)[0] = s[0];
6457 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006458#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006459 ((char *) &uch)[2] = s[2];
6460 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006461#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006462 ch = uch;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006463#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006464 /* We have to sanity check the raw data, otherwise doom looms for
6465 some malformed UCS-4 data. */
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006466 if (ch > 0x10ffff) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006467 endinpos = s - starts + Py_UNICODE_SIZE;
6468 reason = "illegal code point (> 0x10FFFF)";
6469 goto error;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006470 }
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006471#endif
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006472 s += Py_UNICODE_SIZE;
6473#ifndef Py_UNICODE_WIDE
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006474 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006475 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006476 Py_UNICODE uch2;
6477 ((char *) &uch2)[0] = s[0];
6478 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006479 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006480 {
Victor Stinner551ac952011-11-29 22:58:13 +01006481 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006482 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006483 }
6484 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006485#endif
6486
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006487 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006488 goto onError;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006489 continue;
6490
6491 error:
6492 startinpos = s - starts;
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006493 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006494 errors, &errorHandler,
6495 "unicode_internal", reason,
6496 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006497 &writer))
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006498 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006499 }
6500
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006501 Py_XDECREF(errorHandler);
6502 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006503 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006504
Benjamin Peterson29060642009-01-31 22:14:21 +00006505 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006506 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006507 Py_XDECREF(errorHandler);
6508 Py_XDECREF(exc);
6509 return NULL;
6510}
6511
Guido van Rossumd57fd912000-03-10 22:53:23 +00006512/* --- Latin-1 Codec ------------------------------------------------------ */
6513
Alexander Belopolsky40018472011-02-26 01:02:56 +00006514PyObject *
6515PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006516 Py_ssize_t size,
6517 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006518{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006519 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006520 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006521}
6522
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006523/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006524static void
6525make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006526 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006527 PyObject *unicode,
6528 Py_ssize_t startpos, Py_ssize_t endpos,
6529 const char *reason)
6530{
6531 if (*exceptionObject == NULL) {
6532 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006533 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006534 encoding, unicode, startpos, endpos, reason);
6535 }
6536 else {
6537 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6538 goto onError;
6539 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6540 goto onError;
6541 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6542 goto onError;
6543 return;
6544 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006545 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006546 }
6547}
6548
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006549/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006550static void
6551raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006552 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006553 PyObject *unicode,
6554 Py_ssize_t startpos, Py_ssize_t endpos,
6555 const char *reason)
6556{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006557 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006558 encoding, unicode, startpos, endpos, reason);
6559 if (*exceptionObject != NULL)
6560 PyCodec_StrictErrors(*exceptionObject);
6561}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006562
6563/* error handling callback helper:
6564 build arguments, call the callback and check the arguments,
6565 put the result into newpos and return the replacement string, which
6566 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006567static PyObject *
6568unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006569 PyObject **errorHandler,
6570 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006571 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006572 Py_ssize_t startpos, Py_ssize_t endpos,
6573 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006574{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02006575 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006576 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006577 PyObject *restuple;
6578 PyObject *resunicode;
6579
6580 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006581 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006582 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006583 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006584 }
6585
Benjamin Petersonbac79492012-01-14 13:34:47 -05006586 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006587 return NULL;
6588 len = PyUnicode_GET_LENGTH(unicode);
6589
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006590 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006591 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006592 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006593 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006594
6595 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006596 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006597 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006598 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006599 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006600 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006601 Py_DECREF(restuple);
6602 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006603 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006604 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006605 &resunicode, newpos)) {
6606 Py_DECREF(restuple);
6607 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006608 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006609 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6610 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6611 Py_DECREF(restuple);
6612 return NULL;
6613 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006614 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006615 *newpos = len + *newpos;
6616 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02006617 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006618 Py_DECREF(restuple);
6619 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006620 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006621 Py_INCREF(resunicode);
6622 Py_DECREF(restuple);
6623 return resunicode;
6624}
6625
Alexander Belopolsky40018472011-02-26 01:02:56 +00006626static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006627unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006628 const char *errors,
Victor Stinner0030cd52015-09-24 14:45:00 +02006629 const Py_UCS4 limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006630{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006631 /* input state */
6632 Py_ssize_t pos=0, size;
6633 int kind;
6634 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006635 /* pointer into the output */
6636 char *str;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006637 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6638 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Victor Stinner50149202015-09-22 00:26:54 +02006639 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006640 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02006641 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006642 PyObject *rep = NULL;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006643 /* output object */
6644 _PyBytesWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006645
Benjamin Petersonbac79492012-01-14 13:34:47 -05006646 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006647 return NULL;
6648 size = PyUnicode_GET_LENGTH(unicode);
6649 kind = PyUnicode_KIND(unicode);
6650 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006651 /* allocate enough for a simple encoding without
6652 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006653 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006654 return PyBytes_FromStringAndSize(NULL, 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006655
6656 _PyBytesWriter_Init(&writer);
6657 str = _PyBytesWriter_Alloc(&writer, size);
6658 if (str == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006659 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006660
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006661 while (pos < size) {
Victor Stinner0030cd52015-09-24 14:45:00 +02006662 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006663
Benjamin Peterson29060642009-01-31 22:14:21 +00006664 /* can we encode this? */
Victor Stinner0030cd52015-09-24 14:45:00 +02006665 if (ch < limit) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006666 /* no overflow check, because we know that the space is enough */
Victor Stinner0030cd52015-09-24 14:45:00 +02006667 *str++ = (char)ch;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006668 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006669 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006670 else {
Victor Stinner6bd525b2015-10-09 13:10:05 +02006671 Py_ssize_t newpos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006672 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006673 Py_ssize_t collstart = pos;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006674 Py_ssize_t collend = collstart + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006675 /* find all unecodable characters */
Victor Stinner50149202015-09-22 00:26:54 +02006676
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006677 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006678 ++collend;
Victor Stinner50149202015-09-22 00:26:54 +02006679
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006680 /* Only overallocate the buffer if it's not the last write */
6681 writer.overallocate = (collend < size);
6682
Benjamin Peterson29060642009-01-31 22:14:21 +00006683 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02006684 if (error_handler == _Py_ERROR_UNKNOWN)
6685 error_handler = get_error_handler(errors);
6686
6687 switch (error_handler) {
6688 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006689 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006690 goto onError;
Victor Stinner50149202015-09-22 00:26:54 +02006691
6692 case _Py_ERROR_REPLACE:
Victor Stinner01ada392015-10-01 21:54:51 +02006693 memset(str, '?', collend - collstart);
6694 str += (collend - collstart);
Victor Stinner0030cd52015-09-24 14:45:00 +02006695 /* fall through ignore error handler */
Victor Stinner50149202015-09-22 00:26:54 +02006696 case _Py_ERROR_IGNORE:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006697 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006698 break;
Victor Stinner50149202015-09-22 00:26:54 +02006699
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006700 case _Py_ERROR_BACKSLASHREPLACE:
Victor Stinnerad771582015-10-09 12:38:53 +02006701 /* substract preallocated bytes */
6702 writer.min_size -= (collend - collstart);
6703 str = backslashreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006704 unicode, collstart, collend);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006705 if (str == NULL)
6706 goto onError;
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006707 pos = collend;
6708 break;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006709
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006710 case _Py_ERROR_XMLCHARREFREPLACE:
Victor Stinnerad771582015-10-09 12:38:53 +02006711 /* substract preallocated bytes */
6712 writer.min_size -= (collend - collstart);
6713 str = xmlcharrefreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006714 unicode, collstart, collend);
6715 if (str == NULL)
6716 goto onError;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006717 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006718 break;
Victor Stinner50149202015-09-22 00:26:54 +02006719
Victor Stinnerc3713e92015-09-29 12:32:13 +02006720 case _Py_ERROR_SURROGATEESCAPE:
6721 for (i = collstart; i < collend; ++i) {
6722 ch = PyUnicode_READ(kind, data, i);
6723 if (ch < 0xdc80 || 0xdcff < ch) {
6724 /* Not a UTF-8b surrogate */
6725 break;
6726 }
6727 *str++ = (char)(ch - 0xdc00);
6728 ++pos;
6729 }
6730 if (i >= collend)
6731 break;
6732 collstart = pos;
6733 assert(collstart != collend);
6734 /* fallback to general error handling */
6735
Benjamin Peterson29060642009-01-31 22:14:21 +00006736 default:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006737 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
6738 encoding, reason, unicode, &exc,
6739 collstart, collend, &newpos);
6740 if (rep == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006741 goto onError;
Victor Stinner0030cd52015-09-24 14:45:00 +02006742
Victor Stinnerad771582015-10-09 12:38:53 +02006743 /* substract preallocated bytes */
6744 writer.min_size -= 1;
6745
Victor Stinner6bd525b2015-10-09 13:10:05 +02006746 if (PyBytes_Check(rep)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006747 /* Directly copy bytes result to output. */
Victor Stinnerce179bf2015-10-09 12:57:22 +02006748 str = _PyBytesWriter_WriteBytes(&writer, str,
Victor Stinner6bd525b2015-10-09 13:10:05 +02006749 PyBytes_AS_STRING(rep),
6750 PyBytes_GET_SIZE(rep));
Victor Stinnerad771582015-10-09 12:38:53 +02006751 if (str == NULL)
6752 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006753 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02006754 else {
6755 assert(PyUnicode_Check(rep));
Victor Stinner0030cd52015-09-24 14:45:00 +02006756
Victor Stinner6bd525b2015-10-09 13:10:05 +02006757 if (PyUnicode_READY(rep) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006758 goto onError;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006759
6760 if (PyUnicode_IS_ASCII(rep)) {
6761 /* Fast path: all characters are smaller than limit */
6762 assert(limit >= 128);
6763 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6764 str = _PyBytesWriter_WriteBytes(&writer, str,
6765 PyUnicode_DATA(rep),
6766 PyUnicode_GET_LENGTH(rep));
Benjamin Peterson29060642009-01-31 22:14:21 +00006767 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02006768 else {
6769 Py_ssize_t repsize = PyUnicode_GET_LENGTH(rep);
6770
6771 str = _PyBytesWriter_Prepare(&writer, str, repsize);
6772 if (str == NULL)
6773 goto onError;
6774
6775 /* check if there is anything unencodable in the
6776 replacement and copy it to the output */
6777 for (i = 0; repsize-->0; ++i, ++str) {
6778 ch = PyUnicode_READ_CHAR(rep, i);
6779 if (ch >= limit) {
6780 raise_encode_exception(&exc, encoding, unicode,
6781 pos, pos+1, reason);
6782 goto onError;
6783 }
6784 *str = (char)ch;
6785 }
6786 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006787 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006788 pos = newpos;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006789 Py_CLEAR(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006790 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006791
6792 /* If overallocation was disabled, ensure that it was the last
6793 write. Otherwise, we missed an optimization */
6794 assert(writer.overallocate || pos == size);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006795 }
6796 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006797
Victor Stinner50149202015-09-22 00:26:54 +02006798 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006799 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006800 return _PyBytesWriter_Finish(&writer, str);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006801
6802 onError:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006803 Py_XDECREF(rep);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006804 _PyBytesWriter_Dealloc(&writer);
Victor Stinner50149202015-09-22 00:26:54 +02006805 Py_XDECREF(error_handler_obj);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006806 Py_XDECREF(exc);
6807 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006808}
6809
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006810/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006811PyObject *
6812PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006813 Py_ssize_t size,
6814 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006815{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006816 PyObject *result;
6817 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6818 if (unicode == NULL)
6819 return NULL;
6820 result = unicode_encode_ucs1(unicode, errors, 256);
6821 Py_DECREF(unicode);
6822 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006823}
6824
Alexander Belopolsky40018472011-02-26 01:02:56 +00006825PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006826_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006827{
6828 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006829 PyErr_BadArgument();
6830 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006831 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006832 if (PyUnicode_READY(unicode) == -1)
6833 return NULL;
6834 /* Fast path: if it is a one-byte string, construct
6835 bytes object directly. */
6836 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6837 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6838 PyUnicode_GET_LENGTH(unicode));
6839 /* Non-Latin-1 characters present. Defer to above function to
6840 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006841 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006842}
6843
6844PyObject*
6845PyUnicode_AsLatin1String(PyObject *unicode)
6846{
6847 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006848}
6849
6850/* --- 7-bit ASCII Codec -------------------------------------------------- */
6851
Alexander Belopolsky40018472011-02-26 01:02:56 +00006852PyObject *
6853PyUnicode_DecodeASCII(const char *s,
6854 Py_ssize_t size,
6855 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006856{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006857 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006858 _PyUnicodeWriter writer;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006859 int kind;
6860 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006861 Py_ssize_t startinpos;
6862 Py_ssize_t endinpos;
6863 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006864 const char *e;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006865 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006866 PyObject *exc = NULL;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006867 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Tim Petersced69f82003-09-16 20:30:58 +00006868
Guido van Rossumd57fd912000-03-10 22:53:23 +00006869 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02006870 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006871
Guido van Rossumd57fd912000-03-10 22:53:23 +00006872 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006873 if (size == 1 && (unsigned char)s[0] < 128)
6874 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006875
Victor Stinner8f674cc2013-04-17 23:02:17 +02006876 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02006877 writer.min_length = size;
6878 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
Victor Stinner8f674cc2013-04-17 23:02:17 +02006879 return NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006880
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006881 e = s + size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006882 data = writer.data;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006883 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006884 writer.pos = outpos;
6885 if (writer.pos == size)
6886 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006887
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006888 s += writer.pos;
6889 kind = writer.kind;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006890 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02006891 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00006892 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006893 PyUnicode_WRITE(kind, data, writer.pos, c);
6894 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00006895 ++s;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006896 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +00006897 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02006898
6899 /* byte outsize range 0x00..0x7f: call the error handler */
6900
6901 if (error_handler == _Py_ERROR_UNKNOWN)
6902 error_handler = get_error_handler(errors);
6903
6904 switch (error_handler)
6905 {
6906 case _Py_ERROR_REPLACE:
6907 case _Py_ERROR_SURROGATEESCAPE:
6908 /* Fast-path: the error handler only writes one character,
Victor Stinnerca9381e2015-09-22 00:58:32 +02006909 but we may switch to UCS2 at the first write */
6910 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
6911 goto onError;
6912 kind = writer.kind;
6913 data = writer.data;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006914
6915 if (error_handler == _Py_ERROR_REPLACE)
6916 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
6917 else
6918 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
6919 writer.pos++;
6920 ++s;
6921 break;
6922
6923 case _Py_ERROR_IGNORE:
6924 ++s;
6925 break;
6926
6927 default:
Benjamin Peterson29060642009-01-31 22:14:21 +00006928 startinpos = s-starts;
6929 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006930 if (unicode_decode_call_errorhandler_writer(
Victor Stinnerf96418d2015-09-21 23:06:27 +02006931 errors, &error_handler_obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00006932 "ascii", "ordinal not in range(128)",
6933 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006934 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006935 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006936 kind = writer.kind;
6937 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00006938 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006939 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02006940 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006941 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006942 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006943
Benjamin Peterson29060642009-01-31 22:14:21 +00006944 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006945 _PyUnicodeWriter_Dealloc(&writer);
Victor Stinnerf96418d2015-09-21 23:06:27 +02006946 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006947 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006948 return NULL;
6949}
6950
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006951/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006952PyObject *
6953PyUnicode_EncodeASCII(const Py_UNICODE *p,
6954 Py_ssize_t size,
6955 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006956{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006957 PyObject *result;
6958 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6959 if (unicode == NULL)
6960 return NULL;
6961 result = unicode_encode_ucs1(unicode, errors, 128);
6962 Py_DECREF(unicode);
6963 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006964}
6965
Alexander Belopolsky40018472011-02-26 01:02:56 +00006966PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006967_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006968{
6969 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006970 PyErr_BadArgument();
6971 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006972 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006973 if (PyUnicode_READY(unicode) == -1)
6974 return NULL;
6975 /* Fast path: if it is an ASCII-only string, construct bytes object
6976 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02006977 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006978 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6979 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006980 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006981}
6982
6983PyObject *
6984PyUnicode_AsASCIIString(PyObject *unicode)
6985{
6986 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006987}
6988
Victor Stinner99b95382011-07-04 14:23:54 +02006989#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006990
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006991/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006992
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006993#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006994#define NEED_RETRY
6995#endif
6996
Victor Stinner3a50e702011-10-18 21:21:00 +02006997#ifndef WC_ERR_INVALID_CHARS
6998# define WC_ERR_INVALID_CHARS 0x0080
6999#endif
7000
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007001static const char*
Victor Stinner3a50e702011-10-18 21:21:00 +02007002code_page_name(UINT code_page, PyObject **obj)
7003{
7004 *obj = NULL;
7005 if (code_page == CP_ACP)
7006 return "mbcs";
7007 if (code_page == CP_UTF7)
7008 return "CP_UTF7";
7009 if (code_page == CP_UTF8)
7010 return "CP_UTF8";
7011
7012 *obj = PyBytes_FromFormat("cp%u", code_page);
7013 if (*obj == NULL)
7014 return NULL;
7015 return PyBytes_AS_STRING(*obj);
7016}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007017
Victor Stinner3a50e702011-10-18 21:21:00 +02007018static DWORD
7019decode_code_page_flags(UINT code_page)
7020{
7021 if (code_page == CP_UTF7) {
7022 /* The CP_UTF7 decoder only supports flags=0 */
7023 return 0;
7024 }
7025 else
7026 return MB_ERR_INVALID_CHARS;
7027}
7028
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007029/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007030 * Decode a byte string from a Windows code page into unicode object in strict
7031 * mode.
7032 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007033 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7034 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007035 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007036static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007037decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007038 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02007039 const char *in,
7040 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007041{
Victor Stinner3a50e702011-10-18 21:21:00 +02007042 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01007043 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007044 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007045
7046 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007047 assert(insize > 0);
7048 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
7049 if (outsize <= 0)
7050 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007051
7052 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007053 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01007054 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007055 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00007056 if (*v == NULL)
7057 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007058 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007059 }
7060 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007061 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007062 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01007063 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007064 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007065 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007066 }
7067
7068 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007069 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7070 if (outsize <= 0)
7071 goto error;
7072 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007073
Victor Stinner3a50e702011-10-18 21:21:00 +02007074error:
7075 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7076 return -2;
7077 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007078 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007079}
7080
Victor Stinner3a50e702011-10-18 21:21:00 +02007081/*
7082 * Decode a byte string from a code page into unicode object with an error
7083 * handler.
7084 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007085 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02007086 * UnicodeDecodeError exception and returns -1 on error.
7087 */
7088static int
7089decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007090 PyObject **v,
7091 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007092 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02007093{
7094 const char *startin = in;
7095 const char *endin = in + size;
7096 const DWORD flags = decode_code_page_flags(code_page);
7097 /* Ideally, we should get reason from FormatMessage. This is the Windows
7098 2000 English version of the message. */
7099 const char *reason = "No mapping for the Unicode character exists "
7100 "in the target code page.";
7101 /* each step cannot decode more than 1 character, but a character can be
7102 represented as a surrogate pair */
7103 wchar_t buffer[2], *startout, *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02007104 int insize;
7105 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007106 PyObject *errorHandler = NULL;
7107 PyObject *exc = NULL;
7108 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007109 const char *encoding;
Victor Stinner3a50e702011-10-18 21:21:00 +02007110 DWORD err;
7111 int ret = -1;
7112
7113 assert(size > 0);
7114
7115 encoding = code_page_name(code_page, &encoding_obj);
7116 if (encoding == NULL)
7117 return -1;
7118
Victor Stinner7d00cc12014-03-17 23:08:06 +01007119 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007120 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7121 UnicodeDecodeError. */
7122 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7123 if (exc != NULL) {
7124 PyCodec_StrictErrors(exc);
7125 Py_CLEAR(exc);
7126 }
7127 goto error;
7128 }
7129
7130 if (*v == NULL) {
7131 /* Create unicode object */
7132 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7133 PyErr_NoMemory();
7134 goto error;
7135 }
Victor Stinnerab595942011-12-17 04:59:06 +01007136 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007137 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02007138 if (*v == NULL)
7139 goto error;
7140 startout = PyUnicode_AS_UNICODE(*v);
7141 }
7142 else {
7143 /* Extend unicode object */
7144 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7145 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7146 PyErr_NoMemory();
7147 goto error;
7148 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007149 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007150 goto error;
7151 startout = PyUnicode_AS_UNICODE(*v) + n;
7152 }
7153
7154 /* Decode the byte string character per character */
7155 out = startout;
7156 while (in < endin)
7157 {
7158 /* Decode a character */
7159 insize = 1;
7160 do
7161 {
7162 outsize = MultiByteToWideChar(code_page, flags,
7163 in, insize,
7164 buffer, Py_ARRAY_LENGTH(buffer));
7165 if (outsize > 0)
7166 break;
7167 err = GetLastError();
7168 if (err != ERROR_NO_UNICODE_TRANSLATION
7169 && err != ERROR_INSUFFICIENT_BUFFER)
7170 {
7171 PyErr_SetFromWindowsErr(0);
7172 goto error;
7173 }
7174 insize++;
7175 }
7176 /* 4=maximum length of a UTF-8 sequence */
7177 while (insize <= 4 && (in + insize) <= endin);
7178
7179 if (outsize <= 0) {
7180 Py_ssize_t startinpos, endinpos, outpos;
7181
Victor Stinner7d00cc12014-03-17 23:08:06 +01007182 /* last character in partial decode? */
7183 if (in + insize >= endin && !final)
7184 break;
7185
Victor Stinner3a50e702011-10-18 21:21:00 +02007186 startinpos = in - startin;
7187 endinpos = startinpos + 1;
7188 outpos = out - PyUnicode_AS_UNICODE(*v);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007189 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007190 errors, &errorHandler,
7191 encoding, reason,
7192 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01007193 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007194 {
7195 goto error;
7196 }
Victor Stinner596a6c42011-11-09 00:02:18 +01007197 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007198 }
7199 else {
7200 in += insize;
7201 memcpy(out, buffer, outsize * sizeof(wchar_t));
7202 out += outsize;
7203 }
7204 }
7205
7206 /* write a NUL character at the end */
7207 *out = 0;
7208
7209 /* Extend unicode object */
7210 outsize = out - startout;
7211 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01007212 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007213 goto error;
Victor Stinnere1f17c62014-07-25 14:03:03 +02007214 /* (in - startin) <= size and size is an int */
7215 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02007216
7217error:
7218 Py_XDECREF(encoding_obj);
7219 Py_XDECREF(errorHandler);
7220 Py_XDECREF(exc);
7221 return ret;
7222}
7223
Victor Stinner3a50e702011-10-18 21:21:00 +02007224static PyObject *
7225decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007226 const char *s, Py_ssize_t size,
7227 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007228{
Victor Stinner76a31a62011-11-04 00:05:13 +01007229 PyObject *v = NULL;
7230 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007231
Victor Stinner3a50e702011-10-18 21:21:00 +02007232 if (code_page < 0) {
7233 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7234 return NULL;
7235 }
7236
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007237 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007238 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007239
Victor Stinner76a31a62011-11-04 00:05:13 +01007240 do
7241 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007242#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007243 if (size > INT_MAX) {
7244 chunk_size = INT_MAX;
7245 final = 0;
7246 done = 0;
7247 }
7248 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007249#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007250 {
7251 chunk_size = (int)size;
7252 final = (consumed == NULL);
7253 done = 1;
7254 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007255
Victor Stinner76a31a62011-11-04 00:05:13 +01007256 if (chunk_size == 0 && done) {
7257 if (v != NULL)
7258 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007259 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007260 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007261
Victor Stinner76a31a62011-11-04 00:05:13 +01007262 converted = decode_code_page_strict(code_page, &v,
7263 s, chunk_size);
7264 if (converted == -2)
7265 converted = decode_code_page_errors(code_page, &v,
7266 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007267 errors, final);
7268 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007269
7270 if (converted < 0) {
7271 Py_XDECREF(v);
7272 return NULL;
7273 }
7274
7275 if (consumed)
7276 *consumed += converted;
7277
7278 s += converted;
7279 size -= converted;
7280 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007281
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007282 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007283}
7284
Alexander Belopolsky40018472011-02-26 01:02:56 +00007285PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007286PyUnicode_DecodeCodePageStateful(int code_page,
7287 const char *s,
7288 Py_ssize_t size,
7289 const char *errors,
7290 Py_ssize_t *consumed)
7291{
7292 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7293}
7294
7295PyObject *
7296PyUnicode_DecodeMBCSStateful(const char *s,
7297 Py_ssize_t size,
7298 const char *errors,
7299 Py_ssize_t *consumed)
7300{
7301 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7302}
7303
7304PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007305PyUnicode_DecodeMBCS(const char *s,
7306 Py_ssize_t size,
7307 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007308{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007309 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7310}
7311
Victor Stinner3a50e702011-10-18 21:21:00 +02007312static DWORD
7313encode_code_page_flags(UINT code_page, const char *errors)
7314{
7315 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007316 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007317 }
7318 else if (code_page == CP_UTF7) {
7319 /* CP_UTF7 only supports flags=0 */
7320 return 0;
7321 }
7322 else {
7323 if (errors != NULL && strcmp(errors, "replace") == 0)
7324 return 0;
7325 else
7326 return WC_NO_BEST_FIT_CHARS;
7327 }
7328}
7329
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007330/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007331 * Encode a Unicode string to a Windows code page into a byte string in strict
7332 * mode.
7333 *
7334 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007335 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007336 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007337static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007338encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007339 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007340 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007341{
Victor Stinner554f3f02010-06-16 23:33:54 +00007342 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007343 BOOL *pusedDefaultChar = &usedDefaultChar;
7344 int outsize;
Victor Stinner24729f32011-11-10 20:31:37 +01007345 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007346 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007347 const DWORD flags = encode_code_page_flags(code_page, NULL);
7348 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007349 /* Create a substring so that we can get the UTF-16 representation
7350 of just the slice under consideration. */
7351 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007352
Martin v. Löwis3d325192011-11-04 18:23:06 +01007353 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007354
Victor Stinner3a50e702011-10-18 21:21:00 +02007355 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007356 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007357 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007358 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007359
Victor Stinner2fc507f2011-11-04 20:06:39 +01007360 substring = PyUnicode_Substring(unicode, offset, offset+len);
7361 if (substring == NULL)
7362 return -1;
7363 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7364 if (p == NULL) {
7365 Py_DECREF(substring);
7366 return -1;
7367 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007368 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007369
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007370 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007371 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007372 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007373 NULL, 0,
7374 NULL, pusedDefaultChar);
7375 if (outsize <= 0)
7376 goto error;
7377 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007378 if (pusedDefaultChar && *pusedDefaultChar) {
7379 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007380 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007381 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007382
Victor Stinner3a50e702011-10-18 21:21:00 +02007383 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007384 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007385 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007386 if (*outbytes == NULL) {
7387 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007388 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007389 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007390 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007391 }
7392 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007393 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007394 const Py_ssize_t n = PyBytes_Size(*outbytes);
7395 if (outsize > PY_SSIZE_T_MAX - n) {
7396 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007397 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007398 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007399 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007400 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7401 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007402 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007403 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007404 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007405 }
7406
7407 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007408 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007409 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007410 out, outsize,
7411 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007412 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007413 if (outsize <= 0)
7414 goto error;
7415 if (pusedDefaultChar && *pusedDefaultChar)
7416 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007417 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007418
Victor Stinner3a50e702011-10-18 21:21:00 +02007419error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007420 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007421 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7422 return -2;
7423 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007424 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007425}
7426
Victor Stinner3a50e702011-10-18 21:21:00 +02007427/*
Serhiy Storchakad65c9492015-11-02 14:10:23 +02007428 * Encode a Unicode string to a Windows code page into a byte string using an
Victor Stinner3a50e702011-10-18 21:21:00 +02007429 * error handler.
7430 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007431 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007432 * -1 on other error.
7433 */
7434static int
7435encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007436 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007437 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007438{
Victor Stinner3a50e702011-10-18 21:21:00 +02007439 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007440 Py_ssize_t pos = unicode_offset;
7441 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007442 /* Ideally, we should get reason from FormatMessage. This is the Windows
7443 2000 English version of the message. */
7444 const char *reason = "invalid character";
7445 /* 4=maximum length of a UTF-8 sequence */
7446 char buffer[4];
7447 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7448 Py_ssize_t outsize;
7449 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007450 PyObject *errorHandler = NULL;
7451 PyObject *exc = NULL;
7452 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007453 const char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007454 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007455 PyObject *rep;
7456 int ret = -1;
7457
7458 assert(insize > 0);
7459
7460 encoding = code_page_name(code_page, &encoding_obj);
7461 if (encoding == NULL)
7462 return -1;
7463
7464 if (errors == NULL || strcmp(errors, "strict") == 0) {
7465 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7466 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007467 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007468 if (exc != NULL) {
7469 PyCodec_StrictErrors(exc);
7470 Py_DECREF(exc);
7471 }
7472 Py_XDECREF(encoding_obj);
7473 return -1;
7474 }
7475
7476 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7477 pusedDefaultChar = &usedDefaultChar;
7478 else
7479 pusedDefaultChar = NULL;
7480
7481 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7482 PyErr_NoMemory();
7483 goto error;
7484 }
7485 outsize = insize * Py_ARRAY_LENGTH(buffer);
7486
7487 if (*outbytes == NULL) {
7488 /* Create string object */
7489 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7490 if (*outbytes == NULL)
7491 goto error;
7492 out = PyBytes_AS_STRING(*outbytes);
7493 }
7494 else {
7495 /* Extend string object */
7496 Py_ssize_t n = PyBytes_Size(*outbytes);
7497 if (n > PY_SSIZE_T_MAX - outsize) {
7498 PyErr_NoMemory();
7499 goto error;
7500 }
7501 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7502 goto error;
7503 out = PyBytes_AS_STRING(*outbytes) + n;
7504 }
7505
7506 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007507 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007508 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007509 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7510 wchar_t chars[2];
7511 int charsize;
7512 if (ch < 0x10000) {
7513 chars[0] = (wchar_t)ch;
7514 charsize = 1;
7515 }
7516 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007517 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7518 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007519 charsize = 2;
7520 }
7521
Victor Stinner3a50e702011-10-18 21:21:00 +02007522 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007523 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007524 buffer, Py_ARRAY_LENGTH(buffer),
7525 NULL, pusedDefaultChar);
7526 if (outsize > 0) {
7527 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7528 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007529 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007530 memcpy(out, buffer, outsize);
7531 out += outsize;
7532 continue;
7533 }
7534 }
7535 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7536 PyErr_SetFromWindowsErr(0);
7537 goto error;
7538 }
7539
Victor Stinner3a50e702011-10-18 21:21:00 +02007540 rep = unicode_encode_call_errorhandler(
7541 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007542 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007543 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007544 if (rep == NULL)
7545 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007546 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007547
7548 if (PyBytes_Check(rep)) {
7549 outsize = PyBytes_GET_SIZE(rep);
7550 if (outsize != 1) {
7551 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7552 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7553 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7554 Py_DECREF(rep);
7555 goto error;
7556 }
7557 out = PyBytes_AS_STRING(*outbytes) + offset;
7558 }
7559 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7560 out += outsize;
7561 }
7562 else {
7563 Py_ssize_t i;
7564 enum PyUnicode_Kind kind;
7565 void *data;
7566
Benjamin Petersonbac79492012-01-14 13:34:47 -05007567 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007568 Py_DECREF(rep);
7569 goto error;
7570 }
7571
7572 outsize = PyUnicode_GET_LENGTH(rep);
7573 if (outsize != 1) {
7574 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7575 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7576 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7577 Py_DECREF(rep);
7578 goto error;
7579 }
7580 out = PyBytes_AS_STRING(*outbytes) + offset;
7581 }
7582 kind = PyUnicode_KIND(rep);
7583 data = PyUnicode_DATA(rep);
7584 for (i=0; i < outsize; i++) {
7585 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7586 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007587 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007588 encoding, unicode,
7589 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007590 "unable to encode error handler result to ASCII");
7591 Py_DECREF(rep);
7592 goto error;
7593 }
7594 *out = (unsigned char)ch;
7595 out++;
7596 }
7597 }
7598 Py_DECREF(rep);
7599 }
7600 /* write a NUL byte */
7601 *out = 0;
7602 outsize = out - PyBytes_AS_STRING(*outbytes);
7603 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7604 if (_PyBytes_Resize(outbytes, outsize) < 0)
7605 goto error;
7606 ret = 0;
7607
7608error:
7609 Py_XDECREF(encoding_obj);
7610 Py_XDECREF(errorHandler);
7611 Py_XDECREF(exc);
7612 return ret;
7613}
7614
Victor Stinner3a50e702011-10-18 21:21:00 +02007615static PyObject *
7616encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007617 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007618 const char *errors)
7619{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007620 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007621 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007622 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007623 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007624
Victor Stinner29dacf22015-01-26 16:41:32 +01007625 if (!PyUnicode_Check(unicode)) {
7626 PyErr_BadArgument();
7627 return NULL;
7628 }
7629
Benjamin Petersonbac79492012-01-14 13:34:47 -05007630 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007631 return NULL;
7632 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007633
Victor Stinner3a50e702011-10-18 21:21:00 +02007634 if (code_page < 0) {
7635 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7636 return NULL;
7637 }
7638
Martin v. Löwis3d325192011-11-04 18:23:06 +01007639 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007640 return PyBytes_FromStringAndSize(NULL, 0);
7641
Victor Stinner7581cef2011-11-03 22:32:33 +01007642 offset = 0;
7643 do
7644 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007645#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007646 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007647 chunks. */
7648 if (len > INT_MAX/2) {
7649 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007650 done = 0;
7651 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007652 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007653#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007654 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007655 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007656 done = 1;
7657 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007658
Victor Stinner76a31a62011-11-04 00:05:13 +01007659 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007660 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007661 errors);
7662 if (ret == -2)
7663 ret = encode_code_page_errors(code_page, &outbytes,
7664 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007665 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007666 if (ret < 0) {
7667 Py_XDECREF(outbytes);
7668 return NULL;
7669 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007670
Victor Stinner7581cef2011-11-03 22:32:33 +01007671 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007672 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007673 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007674
Victor Stinner3a50e702011-10-18 21:21:00 +02007675 return outbytes;
7676}
7677
7678PyObject *
7679PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7680 Py_ssize_t size,
7681 const char *errors)
7682{
Victor Stinner7581cef2011-11-03 22:32:33 +01007683 PyObject *unicode, *res;
7684 unicode = PyUnicode_FromUnicode(p, size);
7685 if (unicode == NULL)
7686 return NULL;
7687 res = encode_code_page(CP_ACP, unicode, errors);
7688 Py_DECREF(unicode);
7689 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007690}
7691
7692PyObject *
7693PyUnicode_EncodeCodePage(int code_page,
7694 PyObject *unicode,
7695 const char *errors)
7696{
Victor Stinner7581cef2011-11-03 22:32:33 +01007697 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007698}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007699
Alexander Belopolsky40018472011-02-26 01:02:56 +00007700PyObject *
7701PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007702{
Victor Stinner7581cef2011-11-03 22:32:33 +01007703 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007704}
7705
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007706#undef NEED_RETRY
7707
Victor Stinner99b95382011-07-04 14:23:54 +02007708#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007709
Guido van Rossumd57fd912000-03-10 22:53:23 +00007710/* --- Character Mapping Codec -------------------------------------------- */
7711
Victor Stinnerfb161b12013-04-18 01:44:27 +02007712static int
7713charmap_decode_string(const char *s,
7714 Py_ssize_t size,
7715 PyObject *mapping,
7716 const char *errors,
7717 _PyUnicodeWriter *writer)
7718{
7719 const char *starts = s;
7720 const char *e;
7721 Py_ssize_t startinpos, endinpos;
7722 PyObject *errorHandler = NULL, *exc = NULL;
7723 Py_ssize_t maplen;
7724 enum PyUnicode_Kind mapkind;
7725 void *mapdata;
7726 Py_UCS4 x;
7727 unsigned char ch;
7728
7729 if (PyUnicode_READY(mapping) == -1)
7730 return -1;
7731
7732 maplen = PyUnicode_GET_LENGTH(mapping);
7733 mapdata = PyUnicode_DATA(mapping);
7734 mapkind = PyUnicode_KIND(mapping);
7735
7736 e = s + size;
7737
7738 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7739 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7740 * is disabled in encoding aliases, latin1 is preferred because
7741 * its implementation is faster. */
7742 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7743 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7744 Py_UCS4 maxchar = writer->maxchar;
7745
7746 assert (writer->kind == PyUnicode_1BYTE_KIND);
7747 while (s < e) {
7748 ch = *s;
7749 x = mapdata_ucs1[ch];
7750 if (x > maxchar) {
7751 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7752 goto onError;
7753 maxchar = writer->maxchar;
7754 outdata = (Py_UCS1 *)writer->data;
7755 }
7756 outdata[writer->pos] = x;
7757 writer->pos++;
7758 ++s;
7759 }
7760 return 0;
7761 }
7762
7763 while (s < e) {
7764 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7765 enum PyUnicode_Kind outkind = writer->kind;
7766 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7767 if (outkind == PyUnicode_1BYTE_KIND) {
7768 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7769 Py_UCS4 maxchar = writer->maxchar;
7770 while (s < e) {
7771 ch = *s;
7772 x = mapdata_ucs2[ch];
7773 if (x > maxchar)
7774 goto Error;
7775 outdata[writer->pos] = x;
7776 writer->pos++;
7777 ++s;
7778 }
7779 break;
7780 }
7781 else if (outkind == PyUnicode_2BYTE_KIND) {
7782 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7783 while (s < e) {
7784 ch = *s;
7785 x = mapdata_ucs2[ch];
7786 if (x == 0xFFFE)
7787 goto Error;
7788 outdata[writer->pos] = x;
7789 writer->pos++;
7790 ++s;
7791 }
7792 break;
7793 }
7794 }
7795 ch = *s;
7796
7797 if (ch < maplen)
7798 x = PyUnicode_READ(mapkind, mapdata, ch);
7799 else
7800 x = 0xfffe; /* invalid value */
7801Error:
7802 if (x == 0xfffe)
7803 {
7804 /* undefined mapping */
7805 startinpos = s-starts;
7806 endinpos = startinpos+1;
7807 if (unicode_decode_call_errorhandler_writer(
7808 errors, &errorHandler,
7809 "charmap", "character maps to <undefined>",
7810 &starts, &e, &startinpos, &endinpos, &exc, &s,
7811 writer)) {
7812 goto onError;
7813 }
7814 continue;
7815 }
7816
7817 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
7818 goto onError;
7819 ++s;
7820 }
7821 Py_XDECREF(errorHandler);
7822 Py_XDECREF(exc);
7823 return 0;
7824
7825onError:
7826 Py_XDECREF(errorHandler);
7827 Py_XDECREF(exc);
7828 return -1;
7829}
7830
7831static int
7832charmap_decode_mapping(const char *s,
7833 Py_ssize_t size,
7834 PyObject *mapping,
7835 const char *errors,
7836 _PyUnicodeWriter *writer)
7837{
7838 const char *starts = s;
7839 const char *e;
7840 Py_ssize_t startinpos, endinpos;
7841 PyObject *errorHandler = NULL, *exc = NULL;
7842 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02007843 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02007844
7845 e = s + size;
7846
7847 while (s < e) {
7848 ch = *s;
7849
7850 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7851 key = PyLong_FromLong((long)ch);
7852 if (key == NULL)
7853 goto onError;
7854
7855 item = PyObject_GetItem(mapping, key);
7856 Py_DECREF(key);
7857 if (item == NULL) {
7858 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7859 /* No mapping found means: mapping is undefined. */
7860 PyErr_Clear();
7861 goto Undefined;
7862 } else
7863 goto onError;
7864 }
7865
7866 /* Apply mapping */
7867 if (item == Py_None)
7868 goto Undefined;
7869 if (PyLong_Check(item)) {
7870 long value = PyLong_AS_LONG(item);
7871 if (value == 0xFFFE)
7872 goto Undefined;
7873 if (value < 0 || value > MAX_UNICODE) {
7874 PyErr_Format(PyExc_TypeError,
7875 "character mapping must be in range(0x%lx)",
7876 (unsigned long)MAX_UNICODE + 1);
7877 goto onError;
7878 }
7879
7880 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7881 goto onError;
7882 }
7883 else if (PyUnicode_Check(item)) {
7884 if (PyUnicode_READY(item) == -1)
7885 goto onError;
7886 if (PyUnicode_GET_LENGTH(item) == 1) {
7887 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
7888 if (value == 0xFFFE)
7889 goto Undefined;
7890 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7891 goto onError;
7892 }
7893 else {
7894 writer->overallocate = 1;
7895 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
7896 goto onError;
7897 }
7898 }
7899 else {
7900 /* wrong return value */
7901 PyErr_SetString(PyExc_TypeError,
7902 "character mapping must return integer, None or str");
7903 goto onError;
7904 }
7905 Py_CLEAR(item);
7906 ++s;
7907 continue;
7908
7909Undefined:
7910 /* undefined mapping */
7911 Py_CLEAR(item);
7912 startinpos = s-starts;
7913 endinpos = startinpos+1;
7914 if (unicode_decode_call_errorhandler_writer(
7915 errors, &errorHandler,
7916 "charmap", "character maps to <undefined>",
7917 &starts, &e, &startinpos, &endinpos, &exc, &s,
7918 writer)) {
7919 goto onError;
7920 }
7921 }
7922 Py_XDECREF(errorHandler);
7923 Py_XDECREF(exc);
7924 return 0;
7925
7926onError:
7927 Py_XDECREF(item);
7928 Py_XDECREF(errorHandler);
7929 Py_XDECREF(exc);
7930 return -1;
7931}
7932
Alexander Belopolsky40018472011-02-26 01:02:56 +00007933PyObject *
7934PyUnicode_DecodeCharmap(const char *s,
7935 Py_ssize_t size,
7936 PyObject *mapping,
7937 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007938{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007939 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00007940
Guido van Rossumd57fd912000-03-10 22:53:23 +00007941 /* Default to Latin-1 */
7942 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007943 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007944
Guido van Rossumd57fd912000-03-10 22:53:23 +00007945 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02007946 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02007947 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02007948 writer.min_length = size;
7949 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007950 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007951
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007952 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02007953 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
7954 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007955 }
7956 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02007957 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
7958 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007959 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007960 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007961
Benjamin Peterson29060642009-01-31 22:14:21 +00007962 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007963 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007964 return NULL;
7965}
7966
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007967/* Charmap encoding: the lookup table */
7968
Alexander Belopolsky40018472011-02-26 01:02:56 +00007969struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007970 PyObject_HEAD
7971 unsigned char level1[32];
7972 int count2, count3;
7973 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007974};
7975
7976static PyObject*
7977encoding_map_size(PyObject *obj, PyObject* args)
7978{
7979 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007980 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007981 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007982}
7983
7984static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007985 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007986 PyDoc_STR("Return the size (in bytes) of this object") },
7987 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007988};
7989
7990static void
7991encoding_map_dealloc(PyObject* o)
7992{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007993 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007994}
7995
7996static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007997 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007998 "EncodingMap", /*tp_name*/
7999 sizeof(struct encoding_map), /*tp_basicsize*/
8000 0, /*tp_itemsize*/
8001 /* methods */
8002 encoding_map_dealloc, /*tp_dealloc*/
8003 0, /*tp_print*/
8004 0, /*tp_getattr*/
8005 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00008006 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008007 0, /*tp_repr*/
8008 0, /*tp_as_number*/
8009 0, /*tp_as_sequence*/
8010 0, /*tp_as_mapping*/
8011 0, /*tp_hash*/
8012 0, /*tp_call*/
8013 0, /*tp_str*/
8014 0, /*tp_getattro*/
8015 0, /*tp_setattro*/
8016 0, /*tp_as_buffer*/
8017 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8018 0, /*tp_doc*/
8019 0, /*tp_traverse*/
8020 0, /*tp_clear*/
8021 0, /*tp_richcompare*/
8022 0, /*tp_weaklistoffset*/
8023 0, /*tp_iter*/
8024 0, /*tp_iternext*/
8025 encoding_map_methods, /*tp_methods*/
8026 0, /*tp_members*/
8027 0, /*tp_getset*/
8028 0, /*tp_base*/
8029 0, /*tp_dict*/
8030 0, /*tp_descr_get*/
8031 0, /*tp_descr_set*/
8032 0, /*tp_dictoffset*/
8033 0, /*tp_init*/
8034 0, /*tp_alloc*/
8035 0, /*tp_new*/
8036 0, /*tp_free*/
8037 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008038};
8039
8040PyObject*
8041PyUnicode_BuildEncodingMap(PyObject* string)
8042{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008043 PyObject *result;
8044 struct encoding_map *mresult;
8045 int i;
8046 int need_dict = 0;
8047 unsigned char level1[32];
8048 unsigned char level2[512];
8049 unsigned char *mlevel1, *mlevel2, *mlevel3;
8050 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008051 int kind;
8052 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008053 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008054 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008055
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008056 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008057 PyErr_BadArgument();
8058 return NULL;
8059 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008060 kind = PyUnicode_KIND(string);
8061 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008062 length = PyUnicode_GET_LENGTH(string);
8063 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008064 memset(level1, 0xFF, sizeof level1);
8065 memset(level2, 0xFF, sizeof level2);
8066
8067 /* If there isn't a one-to-one mapping of NULL to \0,
8068 or if there are non-BMP characters, we need to use
8069 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008070 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008071 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008072 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008073 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008074 ch = PyUnicode_READ(kind, data, i);
8075 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008076 need_dict = 1;
8077 break;
8078 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008079 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008080 /* unmapped character */
8081 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008082 l1 = ch >> 11;
8083 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008084 if (level1[l1] == 0xFF)
8085 level1[l1] = count2++;
8086 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008087 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008088 }
8089
8090 if (count2 >= 0xFF || count3 >= 0xFF)
8091 need_dict = 1;
8092
8093 if (need_dict) {
8094 PyObject *result = PyDict_New();
8095 PyObject *key, *value;
8096 if (!result)
8097 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008098 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008099 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008100 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008101 if (!key || !value)
8102 goto failed1;
8103 if (PyDict_SetItem(result, key, value) == -1)
8104 goto failed1;
8105 Py_DECREF(key);
8106 Py_DECREF(value);
8107 }
8108 return result;
8109 failed1:
8110 Py_XDECREF(key);
8111 Py_XDECREF(value);
8112 Py_DECREF(result);
8113 return NULL;
8114 }
8115
8116 /* Create a three-level trie */
8117 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8118 16*count2 + 128*count3 - 1);
8119 if (!result)
8120 return PyErr_NoMemory();
8121 PyObject_Init(result, &EncodingMapType);
8122 mresult = (struct encoding_map*)result;
8123 mresult->count2 = count2;
8124 mresult->count3 = count3;
8125 mlevel1 = mresult->level1;
8126 mlevel2 = mresult->level23;
8127 mlevel3 = mresult->level23 + 16*count2;
8128 memcpy(mlevel1, level1, 32);
8129 memset(mlevel2, 0xFF, 16*count2);
8130 memset(mlevel3, 0, 128*count3);
8131 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008132 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008133 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008134 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8135 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008136 /* unmapped character */
8137 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008138 o1 = ch>>11;
8139 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008140 i2 = 16*mlevel1[o1] + o2;
8141 if (mlevel2[i2] == 0xFF)
8142 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008143 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008144 i3 = 128*mlevel2[i2] + o3;
8145 mlevel3[i3] = i;
8146 }
8147 return result;
8148}
8149
8150static int
Victor Stinner22168992011-11-20 17:09:18 +01008151encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008152{
8153 struct encoding_map *map = (struct encoding_map*)mapping;
8154 int l1 = c>>11;
8155 int l2 = (c>>7) & 0xF;
8156 int l3 = c & 0x7F;
8157 int i;
8158
Victor Stinner22168992011-11-20 17:09:18 +01008159 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008160 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008161 if (c == 0)
8162 return 0;
8163 /* level 1*/
8164 i = map->level1[l1];
8165 if (i == 0xFF) {
8166 return -1;
8167 }
8168 /* level 2*/
8169 i = map->level23[16*i+l2];
8170 if (i == 0xFF) {
8171 return -1;
8172 }
8173 /* level 3 */
8174 i = map->level23[16*map->count2 + 128*i + l3];
8175 if (i == 0) {
8176 return -1;
8177 }
8178 return i;
8179}
8180
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008181/* Lookup the character ch in the mapping. If the character
8182 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008183 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008184static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008185charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008186{
Christian Heimes217cfd12007-12-02 14:31:20 +00008187 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008188 PyObject *x;
8189
8190 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008191 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008192 x = PyObject_GetItem(mapping, w);
8193 Py_DECREF(w);
8194 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008195 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8196 /* No mapping found means: mapping is undefined. */
8197 PyErr_Clear();
8198 x = Py_None;
8199 Py_INCREF(x);
8200 return x;
8201 } else
8202 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008203 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008204 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008205 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008206 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008207 long value = PyLong_AS_LONG(x);
8208 if (value < 0 || value > 255) {
8209 PyErr_SetString(PyExc_TypeError,
8210 "character mapping must be in range(256)");
8211 Py_DECREF(x);
8212 return NULL;
8213 }
8214 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008215 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008216 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008217 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008218 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008219 /* wrong return value */
8220 PyErr_Format(PyExc_TypeError,
8221 "character mapping must return integer, bytes or None, not %.400s",
8222 x->ob_type->tp_name);
8223 Py_DECREF(x);
8224 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008225 }
8226}
8227
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008228static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008229charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008230{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008231 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8232 /* exponentially overallocate to minimize reallocations */
8233 if (requiredsize < 2*outsize)
8234 requiredsize = 2*outsize;
8235 if (_PyBytes_Resize(outobj, requiredsize))
8236 return -1;
8237 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008238}
8239
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the character has no mapping; nothing was written */
    enc_EXCEPTION   /* an error occurred and an exception is set */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008243/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008244 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008245 space is available. Return a new reference to the object that
8246 was put in the output buffer, or Py_None, if the mapping was undefined
8247 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008248 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008249static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008250charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008251 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008252{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008253 PyObject *rep;
8254 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008255 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008256
Christian Heimes90aa7642007-12-19 02:45:37 +00008257 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008258 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008259 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008260 if (res == -1)
8261 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008262 if (outsize<requiredsize)
8263 if (charmapencode_resize(outobj, outpos, requiredsize))
8264 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008265 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008266 outstart[(*outpos)++] = (char)res;
8267 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008268 }
8269
8270 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008271 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008272 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008273 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008274 Py_DECREF(rep);
8275 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008276 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008277 if (PyLong_Check(rep)) {
8278 Py_ssize_t requiredsize = *outpos+1;
8279 if (outsize<requiredsize)
8280 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8281 Py_DECREF(rep);
8282 return enc_EXCEPTION;
8283 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008284 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008285 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008286 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008287 else {
8288 const char *repchars = PyBytes_AS_STRING(rep);
8289 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8290 Py_ssize_t requiredsize = *outpos+repsize;
8291 if (outsize<requiredsize)
8292 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8293 Py_DECREF(rep);
8294 return enc_EXCEPTION;
8295 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008296 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008297 memcpy(outstart + *outpos, repchars, repsize);
8298 *outpos += repsize;
8299 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008300 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008301 Py_DECREF(rep);
8302 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008303}
8304
8305/* handle an error in PyUnicode_EncodeCharmap
8306 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008307static int
8308charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008309 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008310 PyObject **exceptionObject,
Victor Stinner50149202015-09-22 00:26:54 +02008311 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008312 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008313{
8314 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008315 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008316 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008317 enum PyUnicode_Kind kind;
8318 void *data;
8319 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008320 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008321 Py_ssize_t collstartpos = *inpos;
8322 Py_ssize_t collendpos = *inpos+1;
8323 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008324 char *encoding = "charmap";
8325 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008326 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008327 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008328 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008329
Benjamin Petersonbac79492012-01-14 13:34:47 -05008330 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008331 return -1;
8332 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008333 /* find all unencodable characters */
8334 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008335 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008336 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008337 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008338 val = encoding_map_lookup(ch, mapping);
8339 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008340 break;
8341 ++collendpos;
8342 continue;
8343 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008344
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008345 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8346 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008347 if (rep==NULL)
8348 return -1;
8349 else if (rep!=Py_None) {
8350 Py_DECREF(rep);
8351 break;
8352 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008353 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008354 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008355 }
8356 /* cache callback name lookup
8357 * (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02008358 if (*error_handler == _Py_ERROR_UNKNOWN)
8359 *error_handler = get_error_handler(errors);
8360
8361 switch (*error_handler) {
8362 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008363 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008364 return -1;
Victor Stinner50149202015-09-22 00:26:54 +02008365
8366 case _Py_ERROR_REPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008367 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008368 x = charmapencode_output('?', mapping, res, respos);
8369 if (x==enc_EXCEPTION) {
8370 return -1;
8371 }
8372 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008373 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008374 return -1;
8375 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008376 }
8377 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02008378 case _Py_ERROR_IGNORE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008379 *inpos = collendpos;
8380 break;
Victor Stinner50149202015-09-22 00:26:54 +02008381
8382 case _Py_ERROR_XMLCHARREFREPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008383 /* generate replacement (temporarily (mis)uses p) */
8384 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008385 char buffer[2+29+1+1];
8386 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008387 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008388 for (cp = buffer; *cp; ++cp) {
8389 x = charmapencode_output(*cp, mapping, res, respos);
8390 if (x==enc_EXCEPTION)
8391 return -1;
8392 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008393 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008394 return -1;
8395 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008396 }
8397 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008398 *inpos = collendpos;
8399 break;
Victor Stinner50149202015-09-22 00:26:54 +02008400
Benjamin Peterson14339b62009-01-31 16:36:08 +00008401 default:
Victor Stinner50149202015-09-22 00:26:54 +02008402 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008403 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008404 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008405 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008406 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008407 if (PyBytes_Check(repunicode)) {
8408 /* Directly copy bytes result to output. */
8409 Py_ssize_t outsize = PyBytes_Size(*res);
8410 Py_ssize_t requiredsize;
8411 repsize = PyBytes_Size(repunicode);
8412 requiredsize = *respos + repsize;
8413 if (requiredsize > outsize)
8414 /* Make room for all additional bytes. */
8415 if (charmapencode_resize(res, respos, requiredsize)) {
8416 Py_DECREF(repunicode);
8417 return -1;
8418 }
8419 memcpy(PyBytes_AsString(*res) + *respos,
8420 PyBytes_AsString(repunicode), repsize);
8421 *respos += repsize;
8422 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008423 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008424 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008425 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008426 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008427 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008428 Py_DECREF(repunicode);
8429 return -1;
8430 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008431 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008432 data = PyUnicode_DATA(repunicode);
8433 kind = PyUnicode_KIND(repunicode);
8434 for (index = 0; index < repsize; index++) {
8435 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8436 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008437 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008438 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008439 return -1;
8440 }
8441 else if (x==enc_FAILED) {
8442 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008443 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008444 return -1;
8445 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008446 }
8447 *inpos = newpos;
8448 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008449 }
8450 return 0;
8451}
8452
Alexander Belopolsky40018472011-02-26 01:02:56 +00008453PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008454_PyUnicode_EncodeCharmap(PyObject *unicode,
8455 PyObject *mapping,
8456 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008457{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008458 /* output object */
8459 PyObject *res = NULL;
8460 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008461 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008462 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008463 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008464 Py_ssize_t respos = 0;
Victor Stinner50149202015-09-22 00:26:54 +02008465 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008466 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02008467 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008468 void *data;
8469 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008470
Benjamin Petersonbac79492012-01-14 13:34:47 -05008471 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008472 return NULL;
8473 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008474 data = PyUnicode_DATA(unicode);
8475 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008476
Guido van Rossumd57fd912000-03-10 22:53:23 +00008477 /* Default to Latin-1 */
8478 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008479 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008480
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008481 /* allocate enough for a simple encoding without
8482 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008483 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008484 if (res == NULL)
8485 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008486 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008487 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008488
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008489 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008490 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008491 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008492 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008493 if (x==enc_EXCEPTION) /* error */
8494 goto onError;
8495 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008496 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008497 &exc,
Victor Stinner50149202015-09-22 00:26:54 +02008498 &error_handler, &error_handler_obj, errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00008499 &res, &respos)) {
8500 goto onError;
8501 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008502 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008503 else
8504 /* done with this character => adjust input position */
8505 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008506 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008507
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008508 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008509 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008510 if (_PyBytes_Resize(&res, respos) < 0)
8511 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008512
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008513 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008514 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008515 return res;
8516
Benjamin Peterson29060642009-01-31 22:14:21 +00008517 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008518 Py_XDECREF(res);
8519 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008520 Py_XDECREF(error_handler_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008521 return NULL;
8522}
8523
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008524/* Deprecated */
8525PyObject *
8526PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8527 Py_ssize_t size,
8528 PyObject *mapping,
8529 const char *errors)
8530{
8531 PyObject *result;
8532 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8533 if (unicode == NULL)
8534 return NULL;
8535 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8536 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008537 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008538}
8539
Alexander Belopolsky40018472011-02-26 01:02:56 +00008540PyObject *
8541PyUnicode_AsCharmapString(PyObject *unicode,
8542 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008543{
8544 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008545 PyErr_BadArgument();
8546 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008547 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008548 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008549}
8550
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008551/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008552static void
8553make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008554 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008555 Py_ssize_t startpos, Py_ssize_t endpos,
8556 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008557{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008558 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008559 *exceptionObject = _PyUnicodeTranslateError_Create(
8560 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008561 }
8562 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008563 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8564 goto onError;
8565 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8566 goto onError;
8567 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8568 goto onError;
8569 return;
8570 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008571 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008572 }
8573}
8574
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008575/* error handling callback helper:
8576 build arguments, call the callback and check the arguments,
8577 put the result into newpos and return the replacement string, which
8578 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008579static PyObject *
8580unicode_translate_call_errorhandler(const char *errors,
8581 PyObject **errorHandler,
8582 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008583 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008584 Py_ssize_t startpos, Py_ssize_t endpos,
8585 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008586{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02008587 static const char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008588
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008589 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008590 PyObject *restuple;
8591 PyObject *resunicode;
8592
8593 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008594 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008595 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008596 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008597 }
8598
8599 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008600 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008601 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008602 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008603
8604 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008605 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008606 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008607 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008608 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008609 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008610 Py_DECREF(restuple);
8611 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008612 }
8613 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008614 &resunicode, &i_newpos)) {
8615 Py_DECREF(restuple);
8616 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008617 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008618 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008619 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008620 else
8621 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008622 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02008623 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008624 Py_DECREF(restuple);
8625 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008626 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008627 Py_INCREF(resunicode);
8628 Py_DECREF(restuple);
8629 return resunicode;
8630}
8631
8632/* Lookup the character ch in the mapping and put the result in result,
8633 which must be decrefed by the caller.
8634 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008635static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008636charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008637{
Christian Heimes217cfd12007-12-02 14:31:20 +00008638 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008639 PyObject *x;
8640
8641 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008642 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008643 x = PyObject_GetItem(mapping, w);
8644 Py_DECREF(w);
8645 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008646 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8647 /* No mapping found means: use 1:1 mapping. */
8648 PyErr_Clear();
8649 *result = NULL;
8650 return 0;
8651 } else
8652 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008653 }
8654 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008655 *result = x;
8656 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008657 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008658 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008659 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008660 if (value < 0 || value > MAX_UNICODE) {
8661 PyErr_Format(PyExc_ValueError,
8662 "character mapping must be in range(0x%x)",
8663 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008664 Py_DECREF(x);
8665 return -1;
8666 }
8667 *result = x;
8668 return 0;
8669 }
8670 else if (PyUnicode_Check(x)) {
8671 *result = x;
8672 return 0;
8673 }
8674 else {
8675 /* wrong return value */
8676 PyErr_SetString(PyExc_TypeError,
8677 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008678 Py_DECREF(x);
8679 return -1;
8680 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008681}
Victor Stinner1194ea02014-04-04 19:37:40 +02008682
8683/* lookup the character, write the result into the writer.
8684 Return 1 if the result was written into the writer, return 0 if the mapping
8685 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008686static int
Victor Stinner1194ea02014-04-04 19:37:40 +02008687charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8688 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008689{
Victor Stinner1194ea02014-04-04 19:37:40 +02008690 PyObject *item;
8691
8692 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00008693 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008694
8695 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008696 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02008697 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008698 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008699 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008700 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008701 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008702
8703 if (item == Py_None) {
8704 Py_DECREF(item);
8705 return 0;
8706 }
8707
8708 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02008709 long ch = (Py_UCS4)PyLong_AS_LONG(item);
8710 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8711 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008712 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8713 Py_DECREF(item);
8714 return -1;
8715 }
8716 Py_DECREF(item);
8717 return 1;
8718 }
8719
8720 if (!PyUnicode_Check(item)) {
8721 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008722 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008723 }
8724
8725 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8726 Py_DECREF(item);
8727 return -1;
8728 }
8729
8730 Py_DECREF(item);
8731 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008732}
8733
Victor Stinner89a76ab2014-04-05 11:44:04 +02008734static int
8735unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8736 Py_UCS1 *translate)
8737{
Benjamin Peterson1365de72014-04-07 20:15:41 -04008738 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008739 int ret = 0;
8740
Victor Stinner89a76ab2014-04-05 11:44:04 +02008741 if (charmaptranslate_lookup(ch, mapping, &item)) {
8742 return -1;
8743 }
8744
8745 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008746 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02008747 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008748 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008749 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008750 /* not found => default to 1:1 mapping */
8751 translate[ch] = ch;
8752 return 1;
8753 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008754 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02008755 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008756 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8757 used it */
8758 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008759 /* invalid character or character outside ASCII:
8760 skip the fast translate */
8761 goto exit;
8762 }
8763 translate[ch] = (Py_UCS1)replace;
8764 }
8765 else if (PyUnicode_Check(item)) {
8766 Py_UCS4 replace;
8767
8768 if (PyUnicode_READY(item) == -1) {
8769 Py_DECREF(item);
8770 return -1;
8771 }
8772 if (PyUnicode_GET_LENGTH(item) != 1)
8773 goto exit;
8774
8775 replace = PyUnicode_READ_CHAR(item, 0);
8776 if (replace > 127)
8777 goto exit;
8778 translate[ch] = (Py_UCS1)replace;
8779 }
8780 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008781 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02008782 goto exit;
8783 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008784 ret = 1;
8785
Benjamin Peterson1365de72014-04-07 20:15:41 -04008786 exit:
8787 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008788 return ret;
8789}
8790
8791/* Fast path for ascii => ascii translation. Return 1 if the whole string
8792 was translated into writer, return 0 if the input string was partially
8793 translated into writer, raise an exception and return -1 on error. */
8794static int
8795unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008796 _PyUnicodeWriter *writer, int ignore,
8797 Py_ssize_t *input_pos)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008798{
Victor Stinner872b2912014-04-05 14:27:07 +02008799 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008800 Py_ssize_t len;
8801 Py_UCS1 *in, *end, *out;
Victor Stinner872b2912014-04-05 14:27:07 +02008802 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008803
Victor Stinner89a76ab2014-04-05 11:44:04 +02008804 len = PyUnicode_GET_LENGTH(input);
8805
Victor Stinner872b2912014-04-05 14:27:07 +02008806 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008807
8808 in = PyUnicode_1BYTE_DATA(input);
8809 end = in + len;
8810
8811 assert(PyUnicode_IS_ASCII(writer->buffer));
8812 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
8813 out = PyUnicode_1BYTE_DATA(writer->buffer);
8814
Victor Stinner872b2912014-04-05 14:27:07 +02008815 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008816 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02008817 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008818 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02008819 int translate = unicode_fast_translate_lookup(mapping, ch,
8820 ascii_table);
8821 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008822 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02008823 if (translate == 0)
8824 goto exit;
8825 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008826 }
Victor Stinner872b2912014-04-05 14:27:07 +02008827 if (ch2 == 0xfe) {
8828 if (ignore)
8829 continue;
8830 goto exit;
8831 }
8832 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008833 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02008834 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008835 }
Victor Stinner872b2912014-04-05 14:27:07 +02008836 res = 1;
8837
8838exit:
8839 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008840 *input_pos = in - PyUnicode_1BYTE_DATA(input);
Victor Stinner872b2912014-04-05 14:27:07 +02008841 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008842}
8843
Victor Stinner3222da22015-10-01 22:07:32 +02008844static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008845_PyUnicode_TranslateCharmap(PyObject *input,
8846 PyObject *mapping,
8847 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008848{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008849 /* input object */
Victor Stinner1194ea02014-04-04 19:37:40 +02008850 char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008851 Py_ssize_t size, i;
8852 int kind;
8853 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02008854 _PyUnicodeWriter writer;
8855 /* error handler */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008856 char *reason = "character maps to <undefined>";
8857 PyObject *errorHandler = NULL;
8858 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008859 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008860 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008861
Guido van Rossumd57fd912000-03-10 22:53:23 +00008862 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008863 PyErr_BadArgument();
8864 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008865 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008866
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008867 if (PyUnicode_READY(input) == -1)
8868 return NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008869 data = (char*)PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008870 kind = PyUnicode_KIND(input);
8871 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008872
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03008873 if (size == 0)
8874 return PyUnicode_FromObject(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008875
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008876 /* allocate enough for a simple 1:1 translation without
8877 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02008878 _PyUnicodeWriter_Init(&writer);
8879 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008880 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008881
Victor Stinner872b2912014-04-05 14:27:07 +02008882 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
8883
Victor Stinner33798672016-03-01 21:59:58 +01008884 if (PyUnicode_READY(input) == -1)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008885 return NULL;
Victor Stinner33798672016-03-01 21:59:58 +01008886 if (PyUnicode_IS_ASCII(input)) {
8887 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
8888 if (res < 0) {
8889 _PyUnicodeWriter_Dealloc(&writer);
8890 return NULL;
8891 }
8892 if (res == 1)
8893 return _PyUnicodeWriter_Finish(&writer);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008894 }
Victor Stinner33798672016-03-01 21:59:58 +01008895 else {
8896 i = 0;
8897 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008898
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008899 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008900 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008901 int translate;
8902 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8903 Py_ssize_t newpos;
8904 /* startpos for collecting untranslatable chars */
8905 Py_ssize_t collstart;
8906 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02008907 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008908
Victor Stinner1194ea02014-04-04 19:37:40 +02008909 ch = PyUnicode_READ(kind, data, i);
8910 translate = charmaptranslate_output(ch, mapping, &writer);
8911 if (translate < 0)
8912 goto onError;
8913
8914 if (translate != 0) {
8915 /* it worked => adjust input pointer */
8916 ++i;
8917 continue;
8918 }
8919
8920 /* untranslatable character */
8921 collstart = i;
8922 collend = i+1;
8923
8924 /* find all untranslatable characters */
8925 while (collend < size) {
8926 PyObject *x;
8927 ch = PyUnicode_READ(kind, data, collend);
8928 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008929 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02008930 Py_XDECREF(x);
8931 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008932 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02008933 ++collend;
8934 }
8935
8936 if (ignore) {
8937 i = collend;
8938 }
8939 else {
8940 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
8941 reason, input, &exc,
8942 collstart, collend, &newpos);
8943 if (repunicode == NULL)
8944 goto onError;
8945 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008946 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02008947 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008948 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008949 Py_DECREF(repunicode);
8950 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008951 }
8952 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008953 Py_XDECREF(exc);
8954 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02008955 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008956
Benjamin Peterson29060642009-01-31 22:14:21 +00008957 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02008958 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008959 Py_XDECREF(exc);
8960 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008961 return NULL;
8962}
8963
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008964/* Deprecated. Use PyUnicode_Translate instead. */
8965PyObject *
8966PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8967 Py_ssize_t size,
8968 PyObject *mapping,
8969 const char *errors)
8970{
Christian Heimes5f520f42012-09-11 14:03:25 +02008971 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008972 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8973 if (!unicode)
8974 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02008975 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8976 Py_DECREF(unicode);
8977 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008978}
8979
Alexander Belopolsky40018472011-02-26 01:02:56 +00008980PyObject *
8981PyUnicode_Translate(PyObject *str,
8982 PyObject *mapping,
8983 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008984{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03008985 if (ensure_unicode(str) < 0)
Christian Heimes5f520f42012-09-11 14:03:25 +02008986 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03008987 return _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008988}
Tim Petersced69f82003-09-16 20:30:58 +00008989
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008990static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008991fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008992{
8993 /* No need to call PyUnicode_READY(self) because this function is only
8994 called as a callback from fixup() which does it already. */
8995 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8996 const int kind = PyUnicode_KIND(self);
8997 void *data = PyUnicode_DATA(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02008998 Py_UCS4 maxchar = 127, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008999 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009000 Py_ssize_t i;
9001
9002 for (i = 0; i < len; ++i) {
9003 ch = PyUnicode_READ(kind, data, i);
9004 fixed = 0;
9005 if (ch > 127) {
9006 if (Py_UNICODE_ISSPACE(ch))
9007 fixed = ' ';
9008 else {
9009 const int decimal = Py_UNICODE_TODECIMAL(ch);
9010 if (decimal >= 0)
9011 fixed = '0' + decimal;
9012 }
9013 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009014 modified = 1;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009015 maxchar = Py_MAX(maxchar, fixed);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009016 PyUnicode_WRITE(kind, data, i, fixed);
9017 }
Victor Stinnere6abb482012-05-02 01:15:40 +02009018 else
Benjamin Peterson7e303732013-06-10 09:19:46 -07009019 maxchar = Py_MAX(maxchar, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009020 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009021 }
9022
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009023 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009024}
9025
9026PyObject *
9027_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9028{
9029 if (!PyUnicode_Check(unicode)) {
9030 PyErr_BadInternalCall();
9031 return NULL;
9032 }
9033 if (PyUnicode_READY(unicode) == -1)
9034 return NULL;
9035 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
9036 /* If the string is already ASCII, just return the same string */
9037 Py_INCREF(unicode);
9038 return unicode;
9039 }
Victor Stinner9310abb2011-10-05 00:59:23 +02009040 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009041}
9042
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009043PyObject *
9044PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9045 Py_ssize_t length)
9046{
Victor Stinnerf0124502011-11-21 23:12:56 +01009047 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009048 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009049 Py_UCS4 maxchar;
9050 enum PyUnicode_Kind kind;
9051 void *data;
9052
Victor Stinner99d7ad02012-02-22 13:37:39 +01009053 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009054 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009055 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009056 if (ch > 127) {
9057 int decimal = Py_UNICODE_TODECIMAL(ch);
9058 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009059 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009060 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009061 }
9062 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009063
9064 /* Copy to a new string */
9065 decimal = PyUnicode_New(length, maxchar);
9066 if (decimal == NULL)
9067 return decimal;
9068 kind = PyUnicode_KIND(decimal);
9069 data = PyUnicode_DATA(decimal);
9070 /* Iterate over code points */
9071 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009072 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01009073 if (ch > 127) {
9074 int decimal = Py_UNICODE_TODECIMAL(ch);
9075 if (decimal >= 0)
9076 ch = '0' + decimal;
9077 }
9078 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009079 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009080 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009081}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009082/* --- Decimal Encoder ---------------------------------------------------- */
9083
Alexander Belopolsky40018472011-02-26 01:02:56 +00009084int
9085PyUnicode_EncodeDecimal(Py_UNICODE *s,
9086 Py_ssize_t length,
9087 char *output,
9088 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009089{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009090 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009091 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009092 enum PyUnicode_Kind kind;
9093 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009094
9095 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009096 PyErr_BadArgument();
9097 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009098 }
9099
Victor Stinner42bf7752011-11-21 22:52:58 +01009100 unicode = PyUnicode_FromUnicode(s, length);
9101 if (unicode == NULL)
9102 return -1;
9103
Benjamin Petersonbac79492012-01-14 13:34:47 -05009104 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner6345be92011-11-25 20:09:01 +01009105 Py_DECREF(unicode);
9106 return -1;
9107 }
Victor Stinner42bf7752011-11-21 22:52:58 +01009108 kind = PyUnicode_KIND(unicode);
9109 data = PyUnicode_DATA(unicode);
9110
Victor Stinnerb84d7232011-11-22 01:50:07 +01009111 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009112 PyObject *exc;
9113 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009114 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009115 Py_ssize_t startpos;
9116
9117 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009118
Benjamin Peterson29060642009-01-31 22:14:21 +00009119 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009120 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009121 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009122 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009123 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009124 decimal = Py_UNICODE_TODECIMAL(ch);
9125 if (decimal >= 0) {
9126 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009127 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009128 continue;
9129 }
9130 if (0 < ch && ch < 256) {
9131 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009132 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009133 continue;
9134 }
Victor Stinner6345be92011-11-25 20:09:01 +01009135
Victor Stinner42bf7752011-11-21 22:52:58 +01009136 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009137 exc = NULL;
9138 raise_encode_exception(&exc, "decimal", unicode,
9139 startpos, startpos+1,
9140 "invalid decimal Unicode string");
9141 Py_XDECREF(exc);
9142 Py_DECREF(unicode);
9143 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009144 }
9145 /* 0-terminate the output string */
9146 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009147 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009148 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009149}
9150
Guido van Rossumd57fd912000-03-10 22:53:23 +00009151/* --- Helpers ------------------------------------------------------------ */
9152
/* Helper macro to fix up start/end slice values in place.

   `end` is clipped to `len`; a negative `end` or `start` is taken
   relative to `len` and clipped to 0 if it is still negative.  `start`
   and `end` must be modifiable lvalues; every argument may be evaluated
   more than once.

   The body is wrapped in do { } while (0) so that the macro behaves like
   a single statement (safe inside an unbraced if/else); invoke it with a
   trailing semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len)) {                    \
            (end) = (len);                      \
        }                                       \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0) {                    \
                (end) = 0;                      \
            }                                   \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0) {                  \
                (start) = 0;                    \
            }                                   \
        }                                       \
    } while (0)
9167
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009168static Py_ssize_t
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009169any_find_slice(PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009170 Py_ssize_t start,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009171 Py_ssize_t end,
9172 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009173{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009174 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009175 void *buf1, *buf2;
9176 Py_ssize_t len1, len2, result;
9177
9178 kind1 = PyUnicode_KIND(s1);
9179 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009180 if (kind1 < kind2)
9181 return -1;
9182
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009183 len1 = PyUnicode_GET_LENGTH(s1);
9184 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009185 ADJUST_INDICES(start, end, len1);
9186 if (end - start < len2)
9187 return -1;
9188
9189 buf1 = PyUnicode_DATA(s1);
9190 buf2 = PyUnicode_DATA(s2);
9191 if (len2 == 1) {
9192 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9193 result = findchar((const char *)buf1 + kind1*start,
9194 kind1, end - start, ch, direction);
9195 if (result == -1)
9196 return -1;
9197 else
9198 return start + result;
9199 }
9200
9201 if (kind2 != kind1) {
9202 buf2 = _PyUnicode_AsKind(s2, kind1);
9203 if (!buf2)
9204 return -2;
9205 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009206
Victor Stinner794d5672011-10-10 03:21:36 +02009207 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009208 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009209 case PyUnicode_1BYTE_KIND:
9210 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9211 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9212 else
9213 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9214 break;
9215 case PyUnicode_2BYTE_KIND:
9216 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9217 break;
9218 case PyUnicode_4BYTE_KIND:
9219 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9220 break;
9221 default:
9222 assert(0); result = -2;
9223 }
9224 }
9225 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009226 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009227 case PyUnicode_1BYTE_KIND:
9228 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9229 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9230 else
9231 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9232 break;
9233 case PyUnicode_2BYTE_KIND:
9234 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9235 break;
9236 case PyUnicode_4BYTE_KIND:
9237 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9238 break;
9239 default:
9240 assert(0); result = -2;
9241 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009242 }
9243
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009244 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009245 PyMem_Free(buf2);
9246
9247 return result;
9248}
9249
9250Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009251_PyUnicode_InsertThousandsGrouping(
9252 PyObject *unicode, Py_ssize_t index,
9253 Py_ssize_t n_buffer,
9254 void *digits, Py_ssize_t n_digits,
9255 Py_ssize_t min_width,
9256 const char *grouping, PyObject *thousands_sep,
9257 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009258{
Victor Stinner41a863c2012-02-24 00:37:51 +01009259 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009260 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01009261 Py_ssize_t thousands_sep_len;
9262 Py_ssize_t len;
9263
9264 if (unicode != NULL) {
9265 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009266 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01009267 }
9268 else {
9269 kind = PyUnicode_1BYTE_KIND;
9270 data = NULL;
9271 }
9272 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
9273 thousands_sep_data = PyUnicode_DATA(thousands_sep);
9274 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9275 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01009276 if (thousands_sep_kind < kind) {
9277 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
9278 if (!thousands_sep_data)
9279 return -1;
9280 }
9281 else {
9282 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
9283 if (!data)
9284 return -1;
9285 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009286 }
9287
Benjamin Petersonead6b532011-12-20 17:23:42 -06009288 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009289 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009290 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01009291 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009292 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009293 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009294 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009295 else
Victor Stinner41a863c2012-02-24 00:37:51 +01009296 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02009297 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009298 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009299 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009300 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009301 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009302 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009303 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009304 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009305 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009306 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009307 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009308 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009309 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009310 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009311 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009312 break;
9313 default:
9314 assert(0);
9315 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009316 }
Victor Stinner90f50d42012-02-24 01:44:47 +01009317 if (unicode != NULL && thousands_sep_kind != kind) {
9318 if (thousands_sep_kind < kind)
9319 PyMem_Free(thousands_sep_data);
9320 else
9321 PyMem_Free(data);
9322 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009323 if (unicode == NULL) {
9324 *maxchar = 127;
9325 if (len != n_digits) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009326 *maxchar = Py_MAX(*maxchar,
Victor Stinnere6abb482012-05-02 01:15:40 +02009327 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
Victor Stinner41a863c2012-02-24 00:37:51 +01009328 }
9329 }
9330 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009331}
9332
9333
Alexander Belopolsky40018472011-02-26 01:02:56 +00009334Py_ssize_t
9335PyUnicode_Count(PyObject *str,
9336 PyObject *substr,
9337 Py_ssize_t start,
9338 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009339{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009340 Py_ssize_t result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009341 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009342 void *buf1 = NULL, *buf2 = NULL;
9343 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009344
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009345 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009346 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009347
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009348 kind1 = PyUnicode_KIND(str);
9349 kind2 = PyUnicode_KIND(substr);
9350 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009351 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009352
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009353 len1 = PyUnicode_GET_LENGTH(str);
9354 len2 = PyUnicode_GET_LENGTH(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009355 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009356 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009357 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009358
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009359 buf1 = PyUnicode_DATA(str);
9360 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009361 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009362 buf2 = _PyUnicode_AsKind(substr, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009363 if (!buf2)
9364 goto onError;
9365 }
9366
9367 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009368 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009369 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
Victor Stinnerc3cec782011-10-05 21:24:08 +02009370 result = asciilib_count(
9371 ((Py_UCS1*)buf1) + start, end - start,
9372 buf2, len2, PY_SSIZE_T_MAX
9373 );
9374 else
9375 result = ucs1lib_count(
9376 ((Py_UCS1*)buf1) + start, end - start,
9377 buf2, len2, PY_SSIZE_T_MAX
9378 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009379 break;
9380 case PyUnicode_2BYTE_KIND:
9381 result = ucs2lib_count(
9382 ((Py_UCS2*)buf1) + start, end - start,
9383 buf2, len2, PY_SSIZE_T_MAX
9384 );
9385 break;
9386 case PyUnicode_4BYTE_KIND:
9387 result = ucs4lib_count(
9388 ((Py_UCS4*)buf1) + start, end - start,
9389 buf2, len2, PY_SSIZE_T_MAX
9390 );
9391 break;
9392 default:
9393 assert(0); result = 0;
9394 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009395
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009396 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009397 PyMem_Free(buf2);
9398
Guido van Rossumd57fd912000-03-10 22:53:23 +00009399 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009400 onError:
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009401 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009402 PyMem_Free(buf2);
9403 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009404}
9405
Alexander Belopolsky40018472011-02-26 01:02:56 +00009406Py_ssize_t
9407PyUnicode_Find(PyObject *str,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009408 PyObject *substr,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009409 Py_ssize_t start,
9410 Py_ssize_t end,
9411 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009412{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009413 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009414 return -2;
Tim Petersced69f82003-09-16 20:30:58 +00009415
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009416 return any_find_slice(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009417}
9418
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009419Py_ssize_t
9420PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9421 Py_ssize_t start, Py_ssize_t end,
9422 int direction)
9423{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009424 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009425 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009426 if (PyUnicode_READY(str) == -1)
9427 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009428 if (start < 0 || end < 0) {
9429 PyErr_SetString(PyExc_IndexError, "string index out of range");
9430 return -2;
9431 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009432 if (end > PyUnicode_GET_LENGTH(str))
9433 end = PyUnicode_GET_LENGTH(str);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009434 if (start >= end)
9435 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009436 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009437 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9438 kind, end-start, ch, direction);
9439 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009440 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009441 else
9442 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009443}
9444
Alexander Belopolsky40018472011-02-26 01:02:56 +00009445static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009446tailmatch(PyObject *self,
9447 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009448 Py_ssize_t start,
9449 Py_ssize_t end,
9450 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009451{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009452 int kind_self;
9453 int kind_sub;
9454 void *data_self;
9455 void *data_sub;
9456 Py_ssize_t offset;
9457 Py_ssize_t i;
9458 Py_ssize_t end_sub;
9459
9460 if (PyUnicode_READY(self) == -1 ||
9461 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009462 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009463
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009464 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9465 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009466 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009467 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009468
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009469 if (PyUnicode_GET_LENGTH(substring) == 0)
9470 return 1;
9471
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009472 kind_self = PyUnicode_KIND(self);
9473 data_self = PyUnicode_DATA(self);
9474 kind_sub = PyUnicode_KIND(substring);
9475 data_sub = PyUnicode_DATA(substring);
9476 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9477
9478 if (direction > 0)
9479 offset = end;
9480 else
9481 offset = start;
9482
9483 if (PyUnicode_READ(kind_self, data_self, offset) ==
9484 PyUnicode_READ(kind_sub, data_sub, 0) &&
9485 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9486 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9487 /* If both are of the same kind, memcmp is sufficient */
9488 if (kind_self == kind_sub) {
9489 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009490 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009491 data_sub,
9492 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009493 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009494 }
Martin Pantere26da7c2016-06-02 10:07:09 +00009495 /* otherwise we have to compare each character by first accessing it */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009496 else {
9497 /* We do not need to compare 0 and len(substring)-1 because
9498 the if statement above ensured already that they are equal
9499 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009500 for (i = 1; i < end_sub; ++i) {
9501 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9502 PyUnicode_READ(kind_sub, data_sub, i))
9503 return 0;
9504 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009505 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009506 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009507 }
9508
9509 return 0;
9510}
9511
Alexander Belopolsky40018472011-02-26 01:02:56 +00009512Py_ssize_t
9513PyUnicode_Tailmatch(PyObject *str,
9514 PyObject *substr,
9515 Py_ssize_t start,
9516 Py_ssize_t end,
9517 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009518{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009519 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009520 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009521
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009522 return tailmatch(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009523}
9524
Guido van Rossumd57fd912000-03-10 22:53:23 +00009525/* Apply fixfct filter to the Unicode object self and return a
9526 reference to the modified object */
9527
Alexander Belopolsky40018472011-02-26 01:02:56 +00009528static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009529fixup(PyObject *self,
9530 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009531{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009532 PyObject *u;
9533 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009534 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009535
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009536 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009537 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009538 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009539 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009540
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009541 /* fix functions return the new maximum character in a string,
9542 if the kind of the resulting unicode object does not change,
9543 everything is fine. Otherwise we need to change the string kind
9544 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009545 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009546
9547 if (maxchar_new == 0) {
9548 /* no changes */;
9549 if (PyUnicode_CheckExact(self)) {
9550 Py_DECREF(u);
9551 Py_INCREF(self);
9552 return self;
9553 }
9554 else
9555 return u;
9556 }
9557
Victor Stinnere6abb482012-05-02 01:15:40 +02009558 maxchar_new = align_maxchar(maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009559
Victor Stinnereaab6042011-12-11 22:22:39 +01009560 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009561 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009562
9563 /* In case the maximum character changed, we need to
9564 convert the string to the new category. */
9565 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9566 if (v == NULL) {
9567 Py_DECREF(u);
9568 return NULL;
9569 }
9570 if (maxchar_new > maxchar_old) {
9571 /* If the maxchar increased so that the kind changed, not all
9572 characters are representable anymore and we need to fix the
9573 string again. This only happens in very few cases. */
Victor Stinnerd3f08822012-05-29 12:57:52 +02009574 _PyUnicode_FastCopyCharacters(v, 0,
9575 self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinnereaab6042011-12-11 22:22:39 +01009576 maxchar_old = fixfct(v);
9577 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009578 }
9579 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009580 _PyUnicode_FastCopyCharacters(v, 0,
9581 u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009582 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009583 Py_DECREF(u);
9584 assert(_PyUnicode_CheckConsistency(v, 1));
9585 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009586}
9587
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009588static PyObject *
9589ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009590{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009591 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9592 char *resdata, *data = PyUnicode_DATA(self);
9593 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009594
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009595 res = PyUnicode_New(len, 127);
9596 if (res == NULL)
9597 return NULL;
9598 resdata = PyUnicode_DATA(res);
9599 if (lower)
9600 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009601 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009602 _Py_bytes_upper(resdata, data, len);
9603 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009604}
9605
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009606static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009607handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009608{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009609 Py_ssize_t j;
9610 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01009611 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009612 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009613
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009614 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9615
9616 where ! is a negation and \p{xxx} is a character with property xxx.
9617 */
9618 for (j = i - 1; j >= 0; j--) {
9619 c = PyUnicode_READ(kind, data, j);
9620 if (!_PyUnicode_IsCaseIgnorable(c))
9621 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009622 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009623 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9624 if (final_sigma) {
9625 for (j = i + 1; j < length; j++) {
9626 c = PyUnicode_READ(kind, data, j);
9627 if (!_PyUnicode_IsCaseIgnorable(c))
9628 break;
9629 }
9630 final_sigma = j == length || !_PyUnicode_IsCased(c);
9631 }
9632 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009633}
9634
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009635static int
9636lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9637 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009638{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009639 /* Obscure special case. */
9640 if (c == 0x3A3) {
9641 mapped[0] = handle_capital_sigma(kind, data, length, i);
9642 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009643 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009644 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009645}
9646
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009647static Py_ssize_t
9648do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009649{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009650 Py_ssize_t i, k = 0;
9651 int n_res, j;
9652 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009653
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009654 c = PyUnicode_READ(kind, data, 0);
9655 n_res = _PyUnicode_ToUpperFull(c, mapped);
9656 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009657 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009658 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009659 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009660 for (i = 1; i < length; i++) {
9661 c = PyUnicode_READ(kind, data, i);
9662 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9663 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009664 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009665 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009666 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009667 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009668 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009669}
9670
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009671static Py_ssize_t
9672do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9673 Py_ssize_t i, k = 0;
9674
9675 for (i = 0; i < length; i++) {
9676 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9677 int n_res, j;
9678 if (Py_UNICODE_ISUPPER(c)) {
9679 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9680 }
9681 else if (Py_UNICODE_ISLOWER(c)) {
9682 n_res = _PyUnicode_ToUpperFull(c, mapped);
9683 }
9684 else {
9685 n_res = 1;
9686 mapped[0] = c;
9687 }
9688 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009689 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009690 res[k++] = mapped[j];
9691 }
9692 }
9693 return k;
9694}
9695
9696static Py_ssize_t
9697do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9698 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009699{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009700 Py_ssize_t i, k = 0;
9701
9702 for (i = 0; i < length; i++) {
9703 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9704 int n_res, j;
9705 if (lower)
9706 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9707 else
9708 n_res = _PyUnicode_ToUpperFull(c, mapped);
9709 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009710 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009711 res[k++] = mapped[j];
9712 }
9713 }
9714 return k;
9715}
9716
9717static Py_ssize_t
9718do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9719{
9720 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9721}
9722
9723static Py_ssize_t
9724do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9725{
9726 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9727}
9728
Benjamin Petersone51757f2012-01-12 21:10:29 -05009729static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009730do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9731{
9732 Py_ssize_t i, k = 0;
9733
9734 for (i = 0; i < length; i++) {
9735 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9736 Py_UCS4 mapped[3];
9737 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9738 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009739 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009740 res[k++] = mapped[j];
9741 }
9742 }
9743 return k;
9744}
9745
9746static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009747do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9748{
9749 Py_ssize_t i, k = 0;
9750 int previous_is_cased;
9751
9752 previous_is_cased = 0;
9753 for (i = 0; i < length; i++) {
9754 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9755 Py_UCS4 mapped[3];
9756 int n_res, j;
9757
9758 if (previous_is_cased)
9759 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9760 else
9761 n_res = _PyUnicode_ToTitleFull(c, mapped);
9762
9763 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009764 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009765 res[k++] = mapped[j];
9766 }
9767
9768 previous_is_cased = _PyUnicode_IsCased(c);
9769 }
9770 return k;
9771}
9772
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009773static PyObject *
9774case_operation(PyObject *self,
9775 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9776{
9777 PyObject *res = NULL;
9778 Py_ssize_t length, newlength = 0;
9779 int kind, outkind;
9780 void *data, *outdata;
9781 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9782
Benjamin Petersoneea48462012-01-16 14:28:50 -05009783 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009784
9785 kind = PyUnicode_KIND(self);
9786 data = PyUnicode_DATA(self);
9787 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +02009788 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -04009789 PyErr_SetString(PyExc_OverflowError, "string is too long");
9790 return NULL;
9791 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -04009792 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009793 if (tmp == NULL)
9794 return PyErr_NoMemory();
9795 newlength = perform(kind, data, length, tmp, &maxchar);
9796 res = PyUnicode_New(newlength, maxchar);
9797 if (res == NULL)
9798 goto leave;
9799 tmpend = tmp + newlength;
9800 outdata = PyUnicode_DATA(res);
9801 outkind = PyUnicode_KIND(res);
9802 switch (outkind) {
9803 case PyUnicode_1BYTE_KIND:
9804 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9805 break;
9806 case PyUnicode_2BYTE_KIND:
9807 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9808 break;
9809 case PyUnicode_4BYTE_KIND:
9810 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9811 break;
9812 default:
9813 assert(0);
9814 break;
9815 }
9816 leave:
9817 PyMem_FREE(tmp);
9818 return res;
9819}
9820
Tim Peters8ce9f162004-08-27 01:49:32 +00009821PyObject *
9822PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009823{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009824 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009825 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009826 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009827 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009828 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9829 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009830 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009831 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009832 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009833 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009834 int use_memcpy;
9835 unsigned char *res_data = NULL, *sep_data = NULL;
9836 PyObject *last_obj;
9837 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009838
Benjamin Peterson9743b2c2014-02-15 13:02:52 -05009839 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +00009840 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009841 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009842 }
9843
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009844 /* NOTE: the following code can't call back into Python code,
9845 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009846 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009847
Tim Peters05eba1f2004-08-27 21:32:02 +00009848 seqlen = PySequence_Fast_GET_SIZE(fseq);
9849 /* If empty sequence, return u"". */
9850 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009851 Py_DECREF(fseq);
Serhiy Storchaka678db842013-01-26 12:16:36 +02009852 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +00009853 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009854
Tim Peters05eba1f2004-08-27 21:32:02 +00009855 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009856 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009857 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009858 if (seqlen == 1) {
9859 if (PyUnicode_CheckExact(items[0])) {
9860 res = items[0];
9861 Py_INCREF(res);
9862 Py_DECREF(fseq);
9863 return res;
9864 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009865 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009866 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009867 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009868 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009869 /* Set up sep and seplen */
9870 if (separator == NULL) {
9871 /* fall back to a blank space separator */
9872 sep = PyUnicode_FromOrdinal(' ');
9873 if (!sep)
9874 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009875 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009876 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009877 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009878 else {
9879 if (!PyUnicode_Check(separator)) {
9880 PyErr_Format(PyExc_TypeError,
9881 "separator: expected str instance,"
9882 " %.80s found",
9883 Py_TYPE(separator)->tp_name);
9884 goto onError;
9885 }
9886 if (PyUnicode_READY(separator))
9887 goto onError;
9888 sep = separator;
9889 seplen = PyUnicode_GET_LENGTH(separator);
9890 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9891 /* inc refcount to keep this code path symmetric with the
9892 above case of a blank separator */
9893 Py_INCREF(sep);
9894 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009895 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009896 }
9897
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009898 /* There are at least two things to join, or else we have a subclass
9899 * of str in the sequence.
9900 * Do a pre-pass to figure out the total amount of space we'll
9901 * need (sz), and see whether all argument are strings.
9902 */
9903 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009904#ifdef Py_DEBUG
9905 use_memcpy = 0;
9906#else
9907 use_memcpy = 1;
9908#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009909 for (i = 0; i < seqlen; i++) {
9910 const Py_ssize_t old_sz = sz;
9911 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009912 if (!PyUnicode_Check(item)) {
9913 PyErr_Format(PyExc_TypeError,
Victor Stinnera33bce02014-07-04 22:47:46 +02009914 "sequence item %zd: expected str instance,"
Benjamin Peterson29060642009-01-31 22:14:21 +00009915 " %.80s found",
9916 i, Py_TYPE(item)->tp_name);
9917 goto onError;
9918 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009919 if (PyUnicode_READY(item) == -1)
9920 goto onError;
9921 sz += PyUnicode_GET_LENGTH(item);
9922 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -07009923 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009924 if (i != 0)
9925 sz += seplen;
9926 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9927 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009928 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009929 goto onError;
9930 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009931 if (use_memcpy && last_obj != NULL) {
9932 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9933 use_memcpy = 0;
9934 }
9935 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009936 }
Tim Petersced69f82003-09-16 20:30:58 +00009937
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009938 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009939 if (res == NULL)
9940 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009941
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009942 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009943#ifdef Py_DEBUG
9944 use_memcpy = 0;
9945#else
9946 if (use_memcpy) {
9947 res_data = PyUnicode_1BYTE_DATA(res);
9948 kind = PyUnicode_KIND(res);
9949 if (seplen != 0)
9950 sep_data = PyUnicode_1BYTE_DATA(sep);
9951 }
9952#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +02009953 if (use_memcpy) {
9954 for (i = 0; i < seqlen; ++i) {
9955 Py_ssize_t itemlen;
9956 item = items[i];
9957
9958 /* Copy item, and maybe the separator. */
9959 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009960 Py_MEMCPY(res_data,
9961 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009962 kind * seplen);
9963 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009964 }
Victor Stinner4560f9c2013-04-14 18:56:46 +02009965
9966 itemlen = PyUnicode_GET_LENGTH(item);
9967 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009968 Py_MEMCPY(res_data,
9969 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009970 kind * itemlen);
9971 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009972 }
Victor Stinner4560f9c2013-04-14 18:56:46 +02009973 }
9974 assert(res_data == PyUnicode_1BYTE_DATA(res)
9975 + kind * PyUnicode_GET_LENGTH(res));
9976 }
9977 else {
9978 for (i = 0, res_offset = 0; i < seqlen; ++i) {
9979 Py_ssize_t itemlen;
9980 item = items[i];
9981
9982 /* Copy item, and maybe the separator. */
9983 if (i && seplen != 0) {
9984 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
9985 res_offset += seplen;
9986 }
9987
9988 itemlen = PyUnicode_GET_LENGTH(item);
9989 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009990 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +02009991 res_offset += itemlen;
9992 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009993 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009994 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +02009995 }
Tim Peters8ce9f162004-08-27 01:49:32 +00009996
Tim Peters05eba1f2004-08-27 21:32:02 +00009997 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009998 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009999 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010000 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010001
Benjamin Peterson29060642009-01-31 22:14:21 +000010002 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +000010003 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010004 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010005 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010006 return NULL;
10007}
10008
/* FILL(kind, data, value, start, length)

   Store `value` into `length` consecutive characters of the buffer `data`,
   beginning at character index `start`.  `kind` selects the element width
   (1, 2 or 4 bytes); the wchar_t kind is not supported.

   This is a macro: `value` and `length` are evaluated more than once in
   the 2- and 4-byte cases, so pass expressions without side effects. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
10032
Victor Stinnerd3f08822012-05-29 12:57:52 +020010033void
10034_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10035 Py_UCS4 fill_char)
10036{
10037 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
10038 const void *data = PyUnicode_DATA(unicode);
10039 assert(PyUnicode_IS_READY(unicode));
10040 assert(unicode_modifiable(unicode));
10041 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10042 assert(start >= 0);
10043 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
10044 FILL(kind, data, fill_char, start, length);
10045}
10046
Victor Stinner3fe55312012-01-04 00:33:50 +010010047Py_ssize_t
10048PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10049 Py_UCS4 fill_char)
10050{
10051 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +010010052
10053 if (!PyUnicode_Check(unicode)) {
10054 PyErr_BadInternalCall();
10055 return -1;
10056 }
10057 if (PyUnicode_READY(unicode) == -1)
10058 return -1;
10059 if (unicode_check_modifiable(unicode))
10060 return -1;
10061
Victor Stinnerd3f08822012-05-29 12:57:52 +020010062 if (start < 0) {
10063 PyErr_SetString(PyExc_IndexError, "string index out of range");
10064 return -1;
10065 }
Victor Stinner3fe55312012-01-04 00:33:50 +010010066 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10067 PyErr_SetString(PyExc_ValueError,
10068 "fill character is bigger than "
10069 "the string maximum character");
10070 return -1;
10071 }
10072
10073 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10074 length = Py_MIN(maxlen, length);
10075 if (length <= 0)
10076 return 0;
10077
Victor Stinnerd3f08822012-05-29 12:57:52 +020010078 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +010010079 return length;
10080}
10081
Victor Stinner9310abb2011-10-05 00:59:23 +020010082static PyObject *
10083pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010084 Py_ssize_t left,
10085 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010086 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010087{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010088 PyObject *u;
10089 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010090 int kind;
10091 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010092
10093 if (left < 0)
10094 left = 0;
10095 if (right < 0)
10096 right = 0;
10097
Victor Stinnerc4b49542011-12-11 22:44:26 +010010098 if (left == 0 && right == 0)
10099 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010100
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010101 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10102 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010103 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10104 return NULL;
10105 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010106 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010107 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010108 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010109 if (!u)
10110 return NULL;
10111
10112 kind = PyUnicode_KIND(u);
10113 data = PyUnicode_DATA(u);
10114 if (left)
10115 FILL(kind, data, fill, 0, left);
10116 if (right)
10117 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010118 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010119 assert(_PyUnicode_CheckConsistency(u, 1));
10120 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010121}
10122
Alexander Belopolsky40018472011-02-26 01:02:56 +000010123PyObject *
10124PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010125{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010126 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010127
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010128 if (ensure_unicode(string) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010129 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010130
Benjamin Petersonead6b532011-12-20 17:23:42 -060010131 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010132 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010133 if (PyUnicode_IS_ASCII(string))
10134 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010135 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010136 PyUnicode_GET_LENGTH(string), keepends);
10137 else
10138 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010139 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010140 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010141 break;
10142 case PyUnicode_2BYTE_KIND:
10143 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010144 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010145 PyUnicode_GET_LENGTH(string), keepends);
10146 break;
10147 case PyUnicode_4BYTE_KIND:
10148 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010149 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010150 PyUnicode_GET_LENGTH(string), keepends);
10151 break;
10152 default:
10153 assert(0);
10154 list = 0;
10155 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010156 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010157}
10158
Alexander Belopolsky40018472011-02-26 01:02:56 +000010159static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010160split(PyObject *self,
10161 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010162 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010163{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010164 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010165 void *buf1, *buf2;
10166 Py_ssize_t len1, len2;
10167 PyObject* out;
10168
Guido van Rossumd57fd912000-03-10 22:53:23 +000010169 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010170 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010171
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010172 if (PyUnicode_READY(self) == -1)
10173 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010174
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010175 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010176 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010177 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010178 if (PyUnicode_IS_ASCII(self))
10179 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010180 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010181 PyUnicode_GET_LENGTH(self), maxcount
10182 );
10183 else
10184 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010185 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010186 PyUnicode_GET_LENGTH(self), maxcount
10187 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010188 case PyUnicode_2BYTE_KIND:
10189 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010190 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010191 PyUnicode_GET_LENGTH(self), maxcount
10192 );
10193 case PyUnicode_4BYTE_KIND:
10194 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010195 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010196 PyUnicode_GET_LENGTH(self), maxcount
10197 );
10198 default:
10199 assert(0);
10200 return NULL;
10201 }
10202
10203 if (PyUnicode_READY(substring) == -1)
10204 return NULL;
10205
10206 kind1 = PyUnicode_KIND(self);
10207 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010208 len1 = PyUnicode_GET_LENGTH(self);
10209 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010210 if (kind1 < kind2 || len1 < len2) {
10211 out = PyList_New(1);
10212 if (out == NULL)
10213 return NULL;
10214 Py_INCREF(self);
10215 PyList_SET_ITEM(out, 0, self);
10216 return out;
10217 }
10218 buf1 = PyUnicode_DATA(self);
10219 buf2 = PyUnicode_DATA(substring);
10220 if (kind2 != kind1) {
10221 buf2 = _PyUnicode_AsKind(substring, kind1);
10222 if (!buf2)
10223 return NULL;
10224 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010225
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010226 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010227 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010228 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10229 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010230 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010231 else
10232 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010233 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010234 break;
10235 case PyUnicode_2BYTE_KIND:
10236 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010237 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010238 break;
10239 case PyUnicode_4BYTE_KIND:
10240 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010241 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010242 break;
10243 default:
10244 out = NULL;
10245 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010246 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010247 PyMem_Free(buf2);
10248 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010249}
10250
Alexander Belopolsky40018472011-02-26 01:02:56 +000010251static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010252rsplit(PyObject *self,
10253 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010254 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010255{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010256 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010257 void *buf1, *buf2;
10258 Py_ssize_t len1, len2;
10259 PyObject* out;
10260
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010261 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010262 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010263
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010264 if (PyUnicode_READY(self) == -1)
10265 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010266
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010267 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010268 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010269 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010270 if (PyUnicode_IS_ASCII(self))
10271 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010272 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010273 PyUnicode_GET_LENGTH(self), maxcount
10274 );
10275 else
10276 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010277 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010278 PyUnicode_GET_LENGTH(self), maxcount
10279 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010280 case PyUnicode_2BYTE_KIND:
10281 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010282 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010283 PyUnicode_GET_LENGTH(self), maxcount
10284 );
10285 case PyUnicode_4BYTE_KIND:
10286 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010287 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010288 PyUnicode_GET_LENGTH(self), maxcount
10289 );
10290 default:
10291 assert(0);
10292 return NULL;
10293 }
10294
10295 if (PyUnicode_READY(substring) == -1)
10296 return NULL;
10297
10298 kind1 = PyUnicode_KIND(self);
10299 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010300 len1 = PyUnicode_GET_LENGTH(self);
10301 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010302 if (kind1 < kind2 || len1 < len2) {
10303 out = PyList_New(1);
10304 if (out == NULL)
10305 return NULL;
10306 Py_INCREF(self);
10307 PyList_SET_ITEM(out, 0, self);
10308 return out;
10309 }
10310 buf1 = PyUnicode_DATA(self);
10311 buf2 = PyUnicode_DATA(substring);
10312 if (kind2 != kind1) {
10313 buf2 = _PyUnicode_AsKind(substring, kind1);
10314 if (!buf2)
10315 return NULL;
10316 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010317
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010318 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010319 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010320 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10321 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010322 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010323 else
10324 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010325 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010326 break;
10327 case PyUnicode_2BYTE_KIND:
10328 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010329 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010330 break;
10331 case PyUnicode_4BYTE_KIND:
10332 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010333 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010334 break;
10335 default:
10336 out = NULL;
10337 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010338 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010339 PyMem_Free(buf2);
10340 return out;
10341}
10342
10343static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010344anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10345 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010346{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010347 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010348 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010349 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10350 return asciilib_find(buf1, len1, buf2, len2, offset);
10351 else
10352 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010353 case PyUnicode_2BYTE_KIND:
10354 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10355 case PyUnicode_4BYTE_KIND:
10356 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10357 }
10358 assert(0);
10359 return -1;
10360}
10361
10362static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010363anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10364 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010365{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010366 switch (kind) {
10367 case PyUnicode_1BYTE_KIND:
10368 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10369 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10370 else
10371 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10372 case PyUnicode_2BYTE_KIND:
10373 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10374 case PyUnicode_4BYTE_KIND:
10375 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10376 }
10377 assert(0);
10378 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010379}
10380
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010381static void
10382replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10383 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10384{
10385 int kind = PyUnicode_KIND(u);
10386 void *data = PyUnicode_DATA(u);
10387 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10388 if (kind == PyUnicode_1BYTE_KIND) {
10389 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10390 (Py_UCS1 *)data + len,
10391 u1, u2, maxcount);
10392 }
10393 else if (kind == PyUnicode_2BYTE_KIND) {
10394 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10395 (Py_UCS2 *)data + len,
10396 u1, u2, maxcount);
10397 }
10398 else {
10399 assert(kind == PyUnicode_4BYTE_KIND);
10400 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10401 (Py_UCS4 *)data + len,
10402 u1, u2, maxcount);
10403 }
10404}
10405
Alexander Belopolsky40018472011-02-26 01:02:56 +000010406static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010407replace(PyObject *self, PyObject *str1,
10408 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010409{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010410 PyObject *u;
10411 char *sbuf = PyUnicode_DATA(self);
10412 char *buf1 = PyUnicode_DATA(str1);
10413 char *buf2 = PyUnicode_DATA(str2);
10414 int srelease = 0, release1 = 0, release2 = 0;
10415 int skind = PyUnicode_KIND(self);
10416 int kind1 = PyUnicode_KIND(str1);
10417 int kind2 = PyUnicode_KIND(str2);
10418 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10419 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10420 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010421 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010422 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010423
10424 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010425 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010426 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010427 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010428
Victor Stinner59de0ee2011-10-07 10:01:28 +020010429 if (str1 == str2)
10430 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010431
Victor Stinner49a0a212011-10-12 23:46:10 +020010432 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010433 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10434 if (maxchar < maxchar_str1)
10435 /* substring too wide to be present */
10436 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010437 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10438 /* Replacing str1 with str2 may cause a maxchar reduction in the
10439 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010440 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010441 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010442
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010443 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010444 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010445 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010446 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010447 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010448 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010449 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010450 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010451
Victor Stinner69ed0f42013-04-09 21:48:24 +020010452 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010453 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010454 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010455 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010456 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010457 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010458 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010459 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010460
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010461 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10462 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010463 }
10464 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010465 int rkind = skind;
10466 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010467 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010468
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010469 if (kind1 < rkind) {
10470 /* widen substring */
10471 buf1 = _PyUnicode_AsKind(str1, rkind);
10472 if (!buf1) goto error;
10473 release1 = 1;
10474 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010475 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010476 if (i < 0)
10477 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010478 if (rkind > kind2) {
10479 /* widen replacement */
10480 buf2 = _PyUnicode_AsKind(str2, rkind);
10481 if (!buf2) goto error;
10482 release2 = 1;
10483 }
10484 else if (rkind < kind2) {
10485 /* widen self and buf1 */
10486 rkind = kind2;
10487 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010488 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010489 sbuf = _PyUnicode_AsKind(self, rkind);
10490 if (!sbuf) goto error;
10491 srelease = 1;
10492 buf1 = _PyUnicode_AsKind(str1, rkind);
10493 if (!buf1) goto error;
10494 release1 = 1;
10495 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010496 u = PyUnicode_New(slen, maxchar);
10497 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010498 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010499 assert(PyUnicode_KIND(u) == rkind);
10500 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010501
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010502 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010503 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010504 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010505 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010506 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010507 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010508
10509 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010510 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010511 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010512 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010513 if (i == -1)
10514 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010515 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010516 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010517 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010518 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010519 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010520 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010521 }
10522 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010523 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010524 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010525 int rkind = skind;
10526 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010527
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010528 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010529 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010530 buf1 = _PyUnicode_AsKind(str1, rkind);
10531 if (!buf1) goto error;
10532 release1 = 1;
10533 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010534 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010535 if (n == 0)
10536 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010537 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010538 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010539 buf2 = _PyUnicode_AsKind(str2, rkind);
10540 if (!buf2) goto error;
10541 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010542 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010543 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010544 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010545 rkind = kind2;
10546 sbuf = _PyUnicode_AsKind(self, rkind);
10547 if (!sbuf) goto error;
10548 srelease = 1;
10549 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010550 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010551 buf1 = _PyUnicode_AsKind(str1, rkind);
10552 if (!buf1) goto error;
10553 release1 = 1;
10554 }
10555 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10556 PyUnicode_GET_LENGTH(str1))); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010557 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010558 PyErr_SetString(PyExc_OverflowError,
10559 "replace string is too long");
10560 goto error;
10561 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010562 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010563 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010564 _Py_INCREF_UNICODE_EMPTY();
10565 if (!unicode_empty)
10566 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010567 u = unicode_empty;
10568 goto done;
10569 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010570 if (new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010571 PyErr_SetString(PyExc_OverflowError,
10572 "replace string is too long");
10573 goto error;
10574 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010575 u = PyUnicode_New(new_size, maxchar);
10576 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010577 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010578 assert(PyUnicode_KIND(u) == rkind);
10579 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010580 ires = i = 0;
10581 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010582 while (n-- > 0) {
10583 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010584 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010585 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010586 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010587 if (j == -1)
10588 break;
10589 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010590 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010591 memcpy(res + rkind * ires,
10592 sbuf + rkind * i,
10593 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010594 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010595 }
10596 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010597 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010598 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010599 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010600 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010601 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010602 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010603 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010604 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010605 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010606 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010607 memcpy(res + rkind * ires,
10608 sbuf + rkind * i,
10609 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010610 }
10611 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010612 /* interleave */
10613 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010614 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010615 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010616 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010617 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010618 if (--n <= 0)
10619 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010620 memcpy(res + rkind * ires,
10621 sbuf + rkind * i,
10622 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010623 ires++;
10624 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010625 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010626 memcpy(res + rkind * ires,
10627 sbuf + rkind * i,
10628 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010629 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010630 }
10631
10632 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010633 unicode_adjust_maxchar(&u);
10634 if (u == NULL)
10635 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010636 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010637
10638 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010639 if (srelease)
10640 PyMem_FREE(sbuf);
10641 if (release1)
10642 PyMem_FREE(buf1);
10643 if (release2)
10644 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010645 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010646 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010647
Benjamin Peterson29060642009-01-31 22:14:21 +000010648 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010649 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010650 if (srelease)
10651 PyMem_FREE(sbuf);
10652 if (release1)
10653 PyMem_FREE(buf1);
10654 if (release2)
10655 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010656 return unicode_result_unchanged(self);
10657
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010658 error:
10659 if (srelease && sbuf)
10660 PyMem_FREE(sbuf);
10661 if (release1 && buf1)
10662 PyMem_FREE(buf1);
10663 if (release2 && buf2)
10664 PyMem_FREE(buf2);
10665 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010666}
10667
10668/* --- Unicode Object Methods --------------------------------------------- */
10669
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010670PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010671 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010672\n\
10673Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010674characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010675
10676static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010677unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010678{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010679 if (PyUnicode_READY(self) == -1)
10680 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010681 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010682}
10683
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010684PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010685 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010686\n\
10687Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010688have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010689
10690static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010691unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010692{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010693 if (PyUnicode_READY(self) == -1)
10694 return NULL;
10695 if (PyUnicode_GET_LENGTH(self) == 0)
10696 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010697 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010698}
10699
Benjamin Petersond5890c82012-01-14 13:23:30 -050010700PyDoc_STRVAR(casefold__doc__,
10701 "S.casefold() -> str\n\
10702\n\
10703Return a version of S suitable for caseless comparisons.");
10704
10705static PyObject *
10706unicode_casefold(PyObject *self)
10707{
10708 if (PyUnicode_READY(self) == -1)
10709 return NULL;
10710 if (PyUnicode_IS_ASCII(self))
10711 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010712 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010713}
10714
10715
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010716/* Argument converter. Accepts a single Unicode character. */
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010717
10718static int
10719convert_uc(PyObject *obj, void *addr)
10720{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010721 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010722
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010723 if (!PyUnicode_Check(obj)) {
10724 PyErr_Format(PyExc_TypeError,
10725 "The fill character must be a unicode character, "
10726 "not %.100s", Py_TYPE(obj)->tp_name);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010727 return 0;
10728 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010729 if (PyUnicode_READY(obj) < 0)
10730 return 0;
10731 if (PyUnicode_GET_LENGTH(obj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010732 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010733 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010734 return 0;
10735 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010736 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010737 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010738}
10739
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010740PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010741 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010742\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010743Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010744done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010745
10746static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010747unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010748{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010749 Py_ssize_t marg, left;
10750 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010751 Py_UCS4 fillchar = ' ';
10752
Victor Stinnere9a29352011-10-01 02:14:59 +020010753 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010754 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010755
Benjamin Petersonbac79492012-01-14 13:34:47 -050010756 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010757 return NULL;
10758
Victor Stinnerc4b49542011-12-11 22:44:26 +010010759 if (PyUnicode_GET_LENGTH(self) >= width)
10760 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010761
Victor Stinnerc4b49542011-12-11 22:44:26 +010010762 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010763 left = marg / 2 + (marg & width & 1);
10764
Victor Stinner9310abb2011-10-05 00:59:23 +020010765 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010766}
10767
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010768/* This function assumes that str1 and str2 are readied by the caller. */
10769
Marc-André Lemburge5034372000-08-08 08:04:29 +000010770static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010771unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010772{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010773#define COMPARE(TYPE1, TYPE2) \
10774 do { \
10775 TYPE1* p1 = (TYPE1 *)data1; \
10776 TYPE2* p2 = (TYPE2 *)data2; \
10777 TYPE1* end = p1 + len; \
10778 Py_UCS4 c1, c2; \
10779 for (; p1 != end; p1++, p2++) { \
10780 c1 = *p1; \
10781 c2 = *p2; \
10782 if (c1 != c2) \
10783 return (c1 < c2) ? -1 : 1; \
10784 } \
10785 } \
10786 while (0)
10787
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010788 int kind1, kind2;
10789 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010790 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010791
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010792 kind1 = PyUnicode_KIND(str1);
10793 kind2 = PyUnicode_KIND(str2);
10794 data1 = PyUnicode_DATA(str1);
10795 data2 = PyUnicode_DATA(str2);
10796 len1 = PyUnicode_GET_LENGTH(str1);
10797 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010798 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010799
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010800 switch(kind1) {
10801 case PyUnicode_1BYTE_KIND:
10802 {
10803 switch(kind2) {
10804 case PyUnicode_1BYTE_KIND:
10805 {
10806 int cmp = memcmp(data1, data2, len);
10807 /* normalize result of memcmp() into the range [-1; 1] */
10808 if (cmp < 0)
10809 return -1;
10810 if (cmp > 0)
10811 return 1;
10812 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020010813 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010814 case PyUnicode_2BYTE_KIND:
10815 COMPARE(Py_UCS1, Py_UCS2);
10816 break;
10817 case PyUnicode_4BYTE_KIND:
10818 COMPARE(Py_UCS1, Py_UCS4);
10819 break;
10820 default:
10821 assert(0);
10822 }
10823 break;
10824 }
10825 case PyUnicode_2BYTE_KIND:
10826 {
10827 switch(kind2) {
10828 case PyUnicode_1BYTE_KIND:
10829 COMPARE(Py_UCS2, Py_UCS1);
10830 break;
10831 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010832 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010833 COMPARE(Py_UCS2, Py_UCS2);
10834 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010835 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010836 case PyUnicode_4BYTE_KIND:
10837 COMPARE(Py_UCS2, Py_UCS4);
10838 break;
10839 default:
10840 assert(0);
10841 }
10842 break;
10843 }
10844 case PyUnicode_4BYTE_KIND:
10845 {
10846 switch(kind2) {
10847 case PyUnicode_1BYTE_KIND:
10848 COMPARE(Py_UCS4, Py_UCS1);
10849 break;
10850 case PyUnicode_2BYTE_KIND:
10851 COMPARE(Py_UCS4, Py_UCS2);
10852 break;
10853 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010854 {
10855#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10856 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10857 /* normalize result of wmemcmp() into the range [-1; 1] */
10858 if (cmp < 0)
10859 return -1;
10860 if (cmp > 0)
10861 return 1;
10862#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010863 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020010864#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010865 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010866 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010867 default:
10868 assert(0);
10869 }
10870 break;
10871 }
10872 default:
10873 assert(0);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010874 }
10875
Victor Stinner770e19e2012-10-04 22:59:45 +020010876 if (len1 == len2)
10877 return 0;
10878 if (len1 < len2)
10879 return -1;
10880 else
10881 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010882
10883#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000010884}
10885
Victor Stinnerc8bc5372013-11-04 11:08:10 +010010886Py_LOCAL(int)
Victor Stinnere5567ad2012-10-23 02:48:49 +020010887unicode_compare_eq(PyObject *str1, PyObject *str2)
10888{
10889 int kind;
10890 void *data1, *data2;
10891 Py_ssize_t len;
10892 int cmp;
10893
Victor Stinnere5567ad2012-10-23 02:48:49 +020010894 len = PyUnicode_GET_LENGTH(str1);
10895 if (PyUnicode_GET_LENGTH(str2) != len)
10896 return 0;
10897 kind = PyUnicode_KIND(str1);
10898 if (PyUnicode_KIND(str2) != kind)
10899 return 0;
10900 data1 = PyUnicode_DATA(str1);
10901 data2 = PyUnicode_DATA(str2);
10902
10903 cmp = memcmp(data1, data2, len * kind);
10904 return (cmp == 0);
10905}
10906
10907
Alexander Belopolsky40018472011-02-26 01:02:56 +000010908int
10909PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010910{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010911 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10912 if (PyUnicode_READY(left) == -1 ||
10913 PyUnicode_READY(right) == -1)
10914 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010010915
10916 /* a string is equal to itself */
10917 if (left == right)
10918 return 0;
10919
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010920 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010921 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010922 PyErr_Format(PyExc_TypeError,
10923 "Can't compare %.100s and %.100s",
10924 left->ob_type->tp_name,
10925 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010926 return -1;
10927}
10928
Martin v. Löwis5b222132007-06-10 09:51:05 +000010929int
Victor Stinnerad14ccd2013-11-07 00:46:04 +010010930_PyUnicode_CompareWithId(PyObject *left, _Py_Identifier *right)
10931{
10932 PyObject *right_str = _PyUnicode_FromId(right); /* borrowed */
10933 if (right_str == NULL)
10934 return -1;
10935 return PyUnicode_Compare(left, right_str);
10936}
10937
10938int
Martin v. Löwis5b222132007-06-10 09:51:05 +000010939PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10940{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010941 Py_ssize_t i;
10942 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010943 Py_UCS4 chr;
10944
Victor Stinner910337b2011-10-03 03:20:16 +020010945 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010946 if (PyUnicode_READY(uni) == -1)
10947 return -1;
10948 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010010949 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010010950 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010010951 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010010952 size_t len, len2 = strlen(str);
10953 int cmp;
10954
10955 len = Py_MIN(len1, len2);
10956 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010010957 if (cmp != 0) {
10958 if (cmp < 0)
10959 return -1;
10960 else
10961 return 1;
10962 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010010963 if (len1 > len2)
10964 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010965 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010010966 return -1; /* str is longer */
10967 return 0;
10968 }
10969 else {
10970 void *data = PyUnicode_DATA(uni);
10971 /* Compare Unicode string and source character set string */
10972 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020010973 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010010974 return (chr < (unsigned char)(str[i])) ? -1 : 1;
10975 /* This check keeps Python strings that end in '\0' from comparing equal
10976 to C strings identical up to that point. */
10977 if (PyUnicode_GET_LENGTH(uni) != i || chr)
10978 return 1; /* uni is longer */
10979 if (str[i])
10980 return -1; /* str is longer */
10981 return 0;
10982 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000010983}
10984
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010985
Benjamin Peterson29060642009-01-31 22:14:21 +000010986#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010987 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010988
Alexander Belopolsky40018472011-02-26 01:02:56 +000010989PyObject *
10990PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010991{
10992 int result;
Victor Stinnere5567ad2012-10-23 02:48:49 +020010993 PyObject *v;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010994
Victor Stinnere5567ad2012-10-23 02:48:49 +020010995 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
10996 Py_RETURN_NOTIMPLEMENTED;
10997
10998 if (PyUnicode_READY(left) == -1 ||
10999 PyUnicode_READY(right) == -1)
11000 return NULL;
11001
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011002 if (left == right) {
11003 switch (op) {
11004 case Py_EQ:
11005 case Py_LE:
11006 case Py_GE:
11007 /* a string is equal to itself */
11008 v = Py_True;
11009 break;
11010 case Py_NE:
11011 case Py_LT:
11012 case Py_GT:
11013 v = Py_False;
11014 break;
11015 default:
11016 PyErr_BadArgument();
11017 return NULL;
11018 }
11019 }
11020 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020011021 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010011022 result ^= (op == Py_NE);
11023 v = TEST_COND(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020011024 }
11025 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020011026 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011027
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011028 /* Convert the return value to a Boolean */
11029 switch (op) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011030 case Py_LE:
11031 v = TEST_COND(result <= 0);
11032 break;
11033 case Py_GE:
11034 v = TEST_COND(result >= 0);
11035 break;
11036 case Py_LT:
11037 v = TEST_COND(result == -1);
11038 break;
11039 case Py_GT:
11040 v = TEST_COND(result == 1);
11041 break;
11042 default:
11043 PyErr_BadArgument();
11044 return NULL;
11045 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011046 }
Victor Stinnere5567ad2012-10-23 02:48:49 +020011047 Py_INCREF(v);
11048 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011049}
11050
Alexander Belopolsky40018472011-02-26 01:02:56 +000011051int
Raymond Hettingerac2ef652015-07-04 16:04:44 -070011052_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11053{
11054 return unicode_eq(aa, bb);
11055}
11056
11057int
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011058PyUnicode_Contains(PyObject *str, PyObject *substr)
Guido van Rossum403d68b2000-03-13 15:55:09 +000011059{
Victor Stinner77282cb2013-04-14 19:22:47 +020011060 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011061 void *buf1, *buf2;
11062 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011063 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011064
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011065 if (!PyUnicode_Check(substr)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011066 PyErr_Format(PyExc_TypeError,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011067 "'in <string>' requires string as left operand, not %.100s",
11068 Py_TYPE(substr)->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011069 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011070 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011071 if (PyUnicode_READY(substr) == -1)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011072 return -1;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011073 if (ensure_unicode(str) < 0)
11074 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011075
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011076 kind1 = PyUnicode_KIND(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011077 kind2 = PyUnicode_KIND(substr);
11078 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011079 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011080 len1 = PyUnicode_GET_LENGTH(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011081 len2 = PyUnicode_GET_LENGTH(substr);
11082 if (len1 < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011083 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011084 buf1 = PyUnicode_DATA(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011085 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011086 if (len2 == 1) {
11087 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11088 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011089 return result;
11090 }
11091 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011092 buf2 = _PyUnicode_AsKind(substr, kind1);
11093 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011094 return -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011095 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011096
Victor Stinner77282cb2013-04-14 19:22:47 +020011097 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011098 case PyUnicode_1BYTE_KIND:
11099 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11100 break;
11101 case PyUnicode_2BYTE_KIND:
11102 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11103 break;
11104 case PyUnicode_4BYTE_KIND:
11105 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11106 break;
11107 default:
11108 result = -1;
11109 assert(0);
11110 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011111
Victor Stinner77282cb2013-04-14 19:22:47 +020011112 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011113 PyMem_Free(buf2);
11114
Guido van Rossum403d68b2000-03-13 15:55:09 +000011115 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011116}
11117
Guido van Rossumd57fd912000-03-10 22:53:23 +000011118/* Concat to string or Unicode object giving a new Unicode object. */
11119
Alexander Belopolsky40018472011-02-26 01:02:56 +000011120PyObject *
11121PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011122{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011123 PyObject *result;
Victor Stinner127226b2011-10-13 01:12:34 +020011124 Py_UCS4 maxchar, maxchar2;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011125 Py_ssize_t left_len, right_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011126
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011127 if (ensure_unicode(left) < 0 || ensure_unicode(right) < 0)
11128 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011129
11130 /* Shortcuts */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011131 if (left == unicode_empty)
11132 return PyUnicode_FromObject(right);
11133 if (right == unicode_empty)
11134 return PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011135
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011136 left_len = PyUnicode_GET_LENGTH(left);
11137 right_len = PyUnicode_GET_LENGTH(right);
11138 if (left_len > PY_SSIZE_T_MAX - right_len) {
Victor Stinner488fa492011-12-12 00:01:39 +010011139 PyErr_SetString(PyExc_OverflowError,
11140 "strings are too large to concat");
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011141 return NULL;
Victor Stinner488fa492011-12-12 00:01:39 +010011142 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011143 new_len = left_len + right_len;
Victor Stinner488fa492011-12-12 00:01:39 +010011144
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011145 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11146 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011147 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011148
Guido van Rossumd57fd912000-03-10 22:53:23 +000011149 /* Concat the two Unicode strings */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011150 result = PyUnicode_New(new_len, maxchar);
11151 if (result == NULL)
11152 return NULL;
11153 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11154 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11155 assert(_PyUnicode_CheckConsistency(result, 1));
11156 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011157}
11158
Walter Dörwald1ab83302007-05-18 17:15:44 +000011159void
Victor Stinner23e56682011-10-03 03:54:37 +020011160PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011161{
Victor Stinner23e56682011-10-03 03:54:37 +020011162 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011163 Py_UCS4 maxchar, maxchar2;
11164 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011165
11166 if (p_left == NULL) {
11167 if (!PyErr_Occurred())
11168 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011169 return;
11170 }
Victor Stinner23e56682011-10-03 03:54:37 +020011171 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011172 if (right == NULL || left == NULL
11173 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011174 if (!PyErr_Occurred())
11175 PyErr_BadInternalCall();
11176 goto error;
11177 }
11178
Benjamin Petersonbac79492012-01-14 13:34:47 -050011179 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011180 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011181 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011182 goto error;
11183
Victor Stinner488fa492011-12-12 00:01:39 +010011184 /* Shortcuts */
11185 if (left == unicode_empty) {
11186 Py_DECREF(left);
11187 Py_INCREF(right);
11188 *p_left = right;
11189 return;
11190 }
11191 if (right == unicode_empty)
11192 return;
11193
11194 left_len = PyUnicode_GET_LENGTH(left);
11195 right_len = PyUnicode_GET_LENGTH(right);
11196 if (left_len > PY_SSIZE_T_MAX - right_len) {
11197 PyErr_SetString(PyExc_OverflowError,
11198 "strings are too large to concat");
11199 goto error;
11200 }
11201 new_len = left_len + right_len;
11202
11203 if (unicode_modifiable(left)
11204 && PyUnicode_CheckExact(right)
11205 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011206 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11207 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011208 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011209 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011210 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11211 {
11212 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011213 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011214 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011215
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011216 /* copy 'right' into the newly allocated area of 'left' */
11217 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011218 }
Victor Stinner488fa492011-12-12 00:01:39 +010011219 else {
11220 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11221 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011222 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011223
Victor Stinner488fa492011-12-12 00:01:39 +010011224 /* Concat the two Unicode strings */
11225 res = PyUnicode_New(new_len, maxchar);
11226 if (res == NULL)
11227 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011228 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11229 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011230 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011231 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011232 }
11233 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011234 return;
11235
11236error:
Victor Stinner488fa492011-12-12 00:01:39 +010011237 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011238}
11239
11240void
11241PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11242{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011243 PyUnicode_Append(pleft, right);
11244 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011245}
11246
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011247/*
11248Wraps stringlib_parse_args_finds() and additionally ensures that the
11249first argument is a unicode object.
11250*/
11251
11252Py_LOCAL_INLINE(int)
11253parse_args_finds_unicode(const char * function_name, PyObject *args,
11254 PyObject **substring,
11255 Py_ssize_t *start, Py_ssize_t *end)
11256{
11257 if(stringlib_parse_args_finds(function_name, args, substring,
11258 start, end)) {
11259 if (ensure_unicode(*substring) < 0)
11260 return 0;
11261 return 1;
11262 }
11263 return 0;
11264}
11265
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011266PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011267 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011268\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011269Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011270string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011271interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011272
11273static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011274unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011275{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011276 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011277 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011278 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011279 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011280 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011281 void *buf1, *buf2;
11282 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011283
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011284 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011285 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011286
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011287 kind1 = PyUnicode_KIND(self);
11288 kind2 = PyUnicode_KIND(substring);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011289 if (kind1 < kind2)
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011290 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011291
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011292 len1 = PyUnicode_GET_LENGTH(self);
11293 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011294 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011295 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011296 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011297
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011298 buf1 = PyUnicode_DATA(self);
11299 buf2 = PyUnicode_DATA(substring);
11300 if (kind2 != kind1) {
11301 buf2 = _PyUnicode_AsKind(substring, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011302 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011303 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011304 }
11305 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011306 case PyUnicode_1BYTE_KIND:
11307 iresult = ucs1lib_count(
11308 ((Py_UCS1*)buf1) + start, end - start,
11309 buf2, len2, PY_SSIZE_T_MAX
11310 );
11311 break;
11312 case PyUnicode_2BYTE_KIND:
11313 iresult = ucs2lib_count(
11314 ((Py_UCS2*)buf1) + start, end - start,
11315 buf2, len2, PY_SSIZE_T_MAX
11316 );
11317 break;
11318 case PyUnicode_4BYTE_KIND:
11319 iresult = ucs4lib_count(
11320 ((Py_UCS4*)buf1) + start, end - start,
11321 buf2, len2, PY_SSIZE_T_MAX
11322 );
11323 break;
11324 default:
11325 assert(0); iresult = 0;
11326 }
11327
11328 result = PyLong_FromSsize_t(iresult);
11329
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011330 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011331 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011332
Guido van Rossumd57fd912000-03-10 22:53:23 +000011333 return result;
11334}
11335
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011336PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000011337 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011338\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000011339Encode S using the codec registered for encoding. Default encoding\n\
11340is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000011341handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000011342a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
11343'xmlcharrefreplace' as well as any other name registered with\n\
11344codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011345
11346static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011347unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011348{
Benjamin Peterson308d6372009-09-18 21:42:35 +000011349 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011350 char *encoding = NULL;
11351 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000011352
Benjamin Peterson308d6372009-09-18 21:42:35 +000011353 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
11354 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011355 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011356 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011357}
11358
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011359PyDoc_STRVAR(expandtabs__doc__,
Ezio Melotti745d54d2013-11-16 19:10:57 +020011360 "S.expandtabs(tabsize=8) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011361\n\
11362Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011363If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011364
11365static PyObject*
Ezio Melotti745d54d2013-11-16 19:10:57 +020011366unicode_expandtabs(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011367{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011368 Py_ssize_t i, j, line_pos, src_len, incr;
11369 Py_UCS4 ch;
11370 PyObject *u;
11371 void *src_data, *dest_data;
Ezio Melotti745d54d2013-11-16 19:10:57 +020011372 static char *kwlist[] = {"tabsize", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011373 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011374 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011375 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011376
Ezio Melotti745d54d2013-11-16 19:10:57 +020011377 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:expandtabs",
11378 kwlist, &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000011379 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011380
Antoine Pitrou22425222011-10-04 19:10:51 +020011381 if (PyUnicode_READY(self) == -1)
11382 return NULL;
11383
Thomas Wouters7e474022000-07-16 12:04:32 +000011384 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011385 src_len = PyUnicode_GET_LENGTH(self);
11386 i = j = line_pos = 0;
11387 kind = PyUnicode_KIND(self);
11388 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011389 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011390 for (; i < src_len; i++) {
11391 ch = PyUnicode_READ(kind, src_data, i);
11392 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011393 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011394 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011395 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011396 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011397 goto overflow;
11398 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011399 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011400 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011401 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011402 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011403 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011404 goto overflow;
11405 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011406 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011407 if (ch == '\n' || ch == '\r')
11408 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011409 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011410 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011411 if (!found)
11412 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011413
Guido van Rossumd57fd912000-03-10 22:53:23 +000011414 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011415 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011416 if (!u)
11417 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011418 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011419
Antoine Pitroue71d5742011-10-04 15:55:09 +020011420 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011421
Antoine Pitroue71d5742011-10-04 15:55:09 +020011422 for (; i < src_len; i++) {
11423 ch = PyUnicode_READ(kind, src_data, i);
11424 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011425 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011426 incr = tabsize - (line_pos % tabsize);
11427 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010011428 FILL(kind, dest_data, ' ', j, incr);
11429 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011430 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011431 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011432 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011433 line_pos++;
11434 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011435 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011436 if (ch == '\n' || ch == '\r')
11437 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011438 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011439 }
11440 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011441 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011442
Antoine Pitroue71d5742011-10-04 15:55:09 +020011443 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011444 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11445 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011446}
11447
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011448PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011449 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011450\n\
11451Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011452such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011453arguments start and end are interpreted as in slice notation.\n\
11454\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011455Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011456
11457static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011458unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011459{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011460 /* initialize variables to prevent gcc warning */
11461 PyObject *substring = NULL;
11462 Py_ssize_t start = 0;
11463 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011464 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011465
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011466 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011467 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011468
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011469 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011470 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011471
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011472 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011473
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011474 if (result == -2)
11475 return NULL;
11476
Christian Heimes217cfd12007-12-02 14:31:20 +000011477 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011478}
11479
11480static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011481unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011482{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011483 void *data;
11484 enum PyUnicode_Kind kind;
11485 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011486
11487 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
11488 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011489 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011490 }
11491 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11492 PyErr_SetString(PyExc_IndexError, "string index out of range");
11493 return NULL;
11494 }
11495 kind = PyUnicode_KIND(self);
11496 data = PyUnicode_DATA(self);
11497 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011498 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011499}
11500
Guido van Rossumc2504932007-09-18 19:42:40 +000011501/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011502 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011503static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011504unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011505{
Guido van Rossumc2504932007-09-18 19:42:40 +000011506 Py_ssize_t len;
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011507 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011508
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011509#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011510 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011511#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011512 if (_PyUnicode_HASH(self) != -1)
11513 return _PyUnicode_HASH(self);
11514 if (PyUnicode_READY(self) == -1)
11515 return -1;
11516 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011517 /*
11518 We make the hash of the empty string be 0, rather than using
11519 (prefix ^ suffix), since this slightly obfuscates the hash secret
11520 */
11521 if (len == 0) {
11522 _PyUnicode_HASH(self) = 0;
11523 return 0;
11524 }
Christian Heimes985ecdc2013-11-20 11:46:18 +010011525 x = _Py_HashBytes(PyUnicode_DATA(self),
11526 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011527 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011528 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011529}
11530
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011531PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011532 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011533\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011534Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011535
11536static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011537unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011538{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011539 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011540 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011541 PyObject *substring = NULL;
11542 Py_ssize_t start = 0;
11543 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011544
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011545 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011546 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011547
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011548 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011549 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011550
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011551 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011552
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011553 if (result == -2)
11554 return NULL;
11555
Guido van Rossumd57fd912000-03-10 22:53:23 +000011556 if (result < 0) {
11557 PyErr_SetString(PyExc_ValueError, "substring not found");
11558 return NULL;
11559 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011560
Christian Heimes217cfd12007-12-02 14:31:20 +000011561 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011562}
11563
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011564PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011565 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011566\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011567Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011568at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011569
11570static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011571unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011572{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011573 Py_ssize_t i, length;
11574 int kind;
11575 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011576 int cased;
11577
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011578 if (PyUnicode_READY(self) == -1)
11579 return NULL;
11580 length = PyUnicode_GET_LENGTH(self);
11581 kind = PyUnicode_KIND(self);
11582 data = PyUnicode_DATA(self);
11583
Guido van Rossumd57fd912000-03-10 22:53:23 +000011584 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011585 if (length == 1)
11586 return PyBool_FromLong(
11587 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011588
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011589 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011590 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011591 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011592
Guido van Rossumd57fd912000-03-10 22:53:23 +000011593 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011594 for (i = 0; i < length; i++) {
11595 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011596
Benjamin Peterson29060642009-01-31 22:14:21 +000011597 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11598 return PyBool_FromLong(0);
11599 else if (!cased && Py_UNICODE_ISLOWER(ch))
11600 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011601 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011602 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011603}
11604
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011605PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011606 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011607\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011608Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011609at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011610
11611static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011612unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011613{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011614 Py_ssize_t i, length;
11615 int kind;
11616 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011617 int cased;
11618
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011619 if (PyUnicode_READY(self) == -1)
11620 return NULL;
11621 length = PyUnicode_GET_LENGTH(self);
11622 kind = PyUnicode_KIND(self);
11623 data = PyUnicode_DATA(self);
11624
Guido van Rossumd57fd912000-03-10 22:53:23 +000011625 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011626 if (length == 1)
11627 return PyBool_FromLong(
11628 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011629
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011630 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011631 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011632 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011633
Guido van Rossumd57fd912000-03-10 22:53:23 +000011634 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011635 for (i = 0; i < length; i++) {
11636 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011637
Benjamin Peterson29060642009-01-31 22:14:21 +000011638 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11639 return PyBool_FromLong(0);
11640 else if (!cased && Py_UNICODE_ISUPPER(ch))
11641 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011642 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011643 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011644}
11645
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011646PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011647 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011648\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011649Return True if S is a titlecased string and there is at least one\n\
11650character in S, i.e. upper- and titlecase characters may only\n\
11651follow uncased characters and lowercase characters only cased ones.\n\
11652Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011653
11654static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011655unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011656{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011657 Py_ssize_t i, length;
11658 int kind;
11659 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011660 int cased, previous_is_cased;
11661
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011662 if (PyUnicode_READY(self) == -1)
11663 return NULL;
11664 length = PyUnicode_GET_LENGTH(self);
11665 kind = PyUnicode_KIND(self);
11666 data = PyUnicode_DATA(self);
11667
Guido van Rossumd57fd912000-03-10 22:53:23 +000011668 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011669 if (length == 1) {
11670 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11671 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11672 (Py_UNICODE_ISUPPER(ch) != 0));
11673 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011674
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011675 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011676 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011677 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011678
Guido van Rossumd57fd912000-03-10 22:53:23 +000011679 cased = 0;
11680 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011681 for (i = 0; i < length; i++) {
11682 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011683
Benjamin Peterson29060642009-01-31 22:14:21 +000011684 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11685 if (previous_is_cased)
11686 return PyBool_FromLong(0);
11687 previous_is_cased = 1;
11688 cased = 1;
11689 }
11690 else if (Py_UNICODE_ISLOWER(ch)) {
11691 if (!previous_is_cased)
11692 return PyBool_FromLong(0);
11693 previous_is_cased = 1;
11694 cased = 1;
11695 }
11696 else
11697 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011698 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011699 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011700}
11701
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011702PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011703 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011704\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011705Return True if all characters in S are whitespace\n\
11706and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011707
11708static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011709unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011710{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011711 Py_ssize_t i, length;
11712 int kind;
11713 void *data;
11714
11715 if (PyUnicode_READY(self) == -1)
11716 return NULL;
11717 length = PyUnicode_GET_LENGTH(self);
11718 kind = PyUnicode_KIND(self);
11719 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011720
Guido van Rossumd57fd912000-03-10 22:53:23 +000011721 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011722 if (length == 1)
11723 return PyBool_FromLong(
11724 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011725
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011726 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011727 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011728 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011729
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011730 for (i = 0; i < length; i++) {
11731 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011732 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011733 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011734 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011735 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011736}
11737
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011738PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011739 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011740\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011741Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011742and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011743
11744static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011745unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011746{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011747 Py_ssize_t i, length;
11748 int kind;
11749 void *data;
11750
11751 if (PyUnicode_READY(self) == -1)
11752 return NULL;
11753 length = PyUnicode_GET_LENGTH(self);
11754 kind = PyUnicode_KIND(self);
11755 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011756
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011757 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011758 if (length == 1)
11759 return PyBool_FromLong(
11760 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011761
11762 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011763 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011764 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011765
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011766 for (i = 0; i < length; i++) {
11767 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011768 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011769 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011770 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011771}
11772
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011773PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011774 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011775\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011776Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011777and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011778
11779static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011780unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011781{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011782 int kind;
11783 void *data;
11784 Py_ssize_t len, i;
11785
11786 if (PyUnicode_READY(self) == -1)
11787 return NULL;
11788
11789 kind = PyUnicode_KIND(self);
11790 data = PyUnicode_DATA(self);
11791 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011792
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011793 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011794 if (len == 1) {
11795 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11796 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11797 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011798
11799 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011800 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011801 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011802
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011803 for (i = 0; i < len; i++) {
11804 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011805 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011806 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011807 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011808 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011809}
11810
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011811PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011812 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011813\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011814Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011815False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011816
11817static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011818unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011819{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011820 Py_ssize_t i, length;
11821 int kind;
11822 void *data;
11823
11824 if (PyUnicode_READY(self) == -1)
11825 return NULL;
11826 length = PyUnicode_GET_LENGTH(self);
11827 kind = PyUnicode_KIND(self);
11828 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011829
Guido van Rossumd57fd912000-03-10 22:53:23 +000011830 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011831 if (length == 1)
11832 return PyBool_FromLong(
11833 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011834
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011835 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011836 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011837 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011838
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011839 for (i = 0; i < length; i++) {
11840 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011841 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011842 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011843 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011844}
11845
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011846PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011847 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011848\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011849Return True if all characters in S are digits\n\
11850and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011851
11852static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011853unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011854{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011855 Py_ssize_t i, length;
11856 int kind;
11857 void *data;
11858
11859 if (PyUnicode_READY(self) == -1)
11860 return NULL;
11861 length = PyUnicode_GET_LENGTH(self);
11862 kind = PyUnicode_KIND(self);
11863 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011864
Guido van Rossumd57fd912000-03-10 22:53:23 +000011865 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011866 if (length == 1) {
11867 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11868 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11869 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011870
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011871 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011872 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011873 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011874
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011875 for (i = 0; i < length; i++) {
11876 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011877 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011878 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011879 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011880}
11881
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011882PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011883 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011884\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011885Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011886False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011887
11888static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011889unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011890{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011891 Py_ssize_t i, length;
11892 int kind;
11893 void *data;
11894
11895 if (PyUnicode_READY(self) == -1)
11896 return NULL;
11897 length = PyUnicode_GET_LENGTH(self);
11898 kind = PyUnicode_KIND(self);
11899 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011900
Guido van Rossumd57fd912000-03-10 22:53:23 +000011901 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011902 if (length == 1)
11903 return PyBool_FromLong(
11904 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011905
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011906 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011907 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011908 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011909
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011910 for (i = 0; i < length; i++) {
11911 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011912 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011913 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011914 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011915}
11916
Martin v. Löwis47383402007-08-15 07:32:56 +000011917int
11918PyUnicode_IsIdentifier(PyObject *self)
11919{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011920 int kind;
11921 void *data;
11922 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011923 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011924
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011925 if (PyUnicode_READY(self) == -1) {
11926 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011927 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011928 }
11929
11930 /* Special case for empty strings */
11931 if (PyUnicode_GET_LENGTH(self) == 0)
11932 return 0;
11933 kind = PyUnicode_KIND(self);
11934 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011935
11936 /* PEP 3131 says that the first character must be in
11937 XID_Start and subsequent characters in XID_Continue,
11938 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011939 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011940 letters, digits, underscore). However, given the current
11941 definition of XID_Start and XID_Continue, it is sufficient
11942 to check just for these, except that _ must be allowed
11943 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011944 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011945 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011946 return 0;
11947
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011948 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011949 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011950 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011951 return 1;
11952}
11953
11954PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011955 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011956\n\
11957Return True if S is a valid identifier according\n\
Raymond Hettinger378170d2013-03-23 08:21:12 -070011958to the language definition.\n\
11959\n\
11960Use keyword.iskeyword() to test for reserved identifiers\n\
11961such as \"def\" and \"class\".\n");
Martin v. Löwis47383402007-08-15 07:32:56 +000011962
11963static PyObject*
11964unicode_isidentifier(PyObject *self)
11965{
11966 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11967}
11968
Georg Brandl559e5d72008-06-11 18:37:52 +000011969PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011970 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011971\n\
11972Return True if all characters in S are considered\n\
11973printable in repr() or S is empty, False otherwise.");
11974
11975static PyObject*
11976unicode_isprintable(PyObject *self)
11977{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011978 Py_ssize_t i, length;
11979 int kind;
11980 void *data;
11981
11982 if (PyUnicode_READY(self) == -1)
11983 return NULL;
11984 length = PyUnicode_GET_LENGTH(self);
11985 kind = PyUnicode_KIND(self);
11986 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011987
11988 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011989 if (length == 1)
11990 return PyBool_FromLong(
11991 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011992
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011993 for (i = 0; i < length; i++) {
11994 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011995 Py_RETURN_FALSE;
11996 }
11997 }
11998 Py_RETURN_TRUE;
11999}
12000
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012001PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000012002 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012003\n\
12004Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000012005iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012006
12007static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012008unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012009{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012010 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012011}
12012
Martin v. Löwis18e16552006-02-15 17:27:45 +000012013static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012014unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012015{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012016 if (PyUnicode_READY(self) == -1)
12017 return -1;
12018 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012019}
12020
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012021PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012022 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012023\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012024Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012025done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012026
12027static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012028unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012029{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012030 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012031 Py_UCS4 fillchar = ' ';
12032
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012033 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012034 return NULL;
12035
Benjamin Petersonbac79492012-01-14 13:34:47 -050012036 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012037 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012038
Victor Stinnerc4b49542011-12-11 22:44:26 +010012039 if (PyUnicode_GET_LENGTH(self) >= width)
12040 return unicode_result_unchanged(self);
12041
12042 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012043}
12044
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012045PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012046 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012047\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012048Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012049
12050static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012051unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012052{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012053 if (PyUnicode_READY(self) == -1)
12054 return NULL;
12055 if (PyUnicode_IS_ASCII(self))
12056 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012057 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012058}
12059
/* Selectors for the strip family: which end(s) of the string to trim. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple() format strings, indexed by the selectors above. */
static const char * const stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for selector i: its format string without the leading
   three characters ("|O:"). */
#define STRIPNAME(i) (&stripformat[i][3])
12068
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012069/* externally visible for str.strip(unicode) */
12070PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012071_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012072{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012073 void *data;
12074 int kind;
12075 Py_ssize_t i, j, len;
12076 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012077 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012078
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012079 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12080 return NULL;
12081
12082 kind = PyUnicode_KIND(self);
12083 data = PyUnicode_DATA(self);
12084 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020012085 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012086 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12087 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020012088 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012089
Benjamin Peterson14339b62009-01-31 16:36:08 +000012090 i = 0;
12091 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012092 while (i < len) {
12093 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12094 if (!BLOOM(sepmask, ch))
12095 break;
12096 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12097 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012098 i++;
12099 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012100 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012101
Benjamin Peterson14339b62009-01-31 16:36:08 +000012102 j = len;
12103 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012104 j--;
12105 while (j >= i) {
12106 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12107 if (!BLOOM(sepmask, ch))
12108 break;
12109 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12110 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012111 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012112 }
12113
Benjamin Peterson29060642009-01-31 22:14:21 +000012114 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012115 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012116
Victor Stinner7931d9a2011-11-04 00:22:48 +010012117 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012118}
12119
12120PyObject*
12121PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12122{
12123 unsigned char *data;
12124 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012125 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012126
Victor Stinnerde636f32011-10-01 03:55:54 +020012127 if (PyUnicode_READY(self) == -1)
12128 return NULL;
12129
Victor Stinner684d5fd2012-05-03 02:32:34 +020012130 length = PyUnicode_GET_LENGTH(self);
12131 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012132
Victor Stinner684d5fd2012-05-03 02:32:34 +020012133 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012134 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012135
Victor Stinnerde636f32011-10-01 03:55:54 +020012136 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012137 PyErr_SetString(PyExc_IndexError, "string index out of range");
12138 return NULL;
12139 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012140 if (start >= length || end < start)
12141 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012142
Victor Stinner684d5fd2012-05-03 02:32:34 +020012143 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012144 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012145 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012146 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012147 }
12148 else {
12149 kind = PyUnicode_KIND(self);
12150 data = PyUnicode_1BYTE_DATA(self);
12151 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012152 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012153 length);
12154 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012155}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012156
12157static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012158do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012159{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012160 Py_ssize_t len, i, j;
12161
12162 if (PyUnicode_READY(self) == -1)
12163 return NULL;
12164
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012165 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012166
Victor Stinnercc7af722013-04-09 22:39:24 +020012167 if (PyUnicode_IS_ASCII(self)) {
12168 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12169
12170 i = 0;
12171 if (striptype != RIGHTSTRIP) {
12172 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012173 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012174 if (!_Py_ascii_whitespace[ch])
12175 break;
12176 i++;
12177 }
12178 }
12179
12180 j = len;
12181 if (striptype != LEFTSTRIP) {
12182 j--;
12183 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012184 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012185 if (!_Py_ascii_whitespace[ch])
12186 break;
12187 j--;
12188 }
12189 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012190 }
12191 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012192 else {
12193 int kind = PyUnicode_KIND(self);
12194 void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012195
Victor Stinnercc7af722013-04-09 22:39:24 +020012196 i = 0;
12197 if (striptype != RIGHTSTRIP) {
12198 while (i < len) {
12199 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12200 if (!Py_UNICODE_ISSPACE(ch))
12201 break;
12202 i++;
12203 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012204 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012205
12206 j = len;
12207 if (striptype != LEFTSTRIP) {
12208 j--;
12209 while (j >= i) {
12210 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12211 if (!Py_UNICODE_ISSPACE(ch))
12212 break;
12213 j--;
12214 }
12215 j++;
12216 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012217 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012218
Victor Stinner7931d9a2011-11-04 00:22:48 +010012219 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012220}
12221
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012222
12223static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012224do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012225{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012226 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012227
Serhiy Storchakac6792272013-10-19 21:03:34 +030012228 if (!PyArg_ParseTuple(args, stripformat[striptype], &sep))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012229 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012230
Benjamin Peterson14339b62009-01-31 16:36:08 +000012231 if (sep != NULL && sep != Py_None) {
12232 if (PyUnicode_Check(sep))
12233 return _PyUnicode_XStrip(self, striptype, sep);
12234 else {
12235 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012236 "%s arg must be None or str",
12237 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012238 return NULL;
12239 }
12240 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012241
Benjamin Peterson14339b62009-01-31 16:36:08 +000012242 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012243}
12244
12245
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012246PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012247 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012248\n\
12249Return a copy of the string S with leading and trailing\n\
12250whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012251If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012252
12253static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012254unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012255{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012256 if (PyTuple_GET_SIZE(args) == 0)
12257 return do_strip(self, BOTHSTRIP); /* Common case */
12258 else
12259 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012260}
12261
12262
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012263PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012264 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012265\n\
12266Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012267If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012268
12269static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012270unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012271{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012272 if (PyTuple_GET_SIZE(args) == 0)
12273 return do_strip(self, LEFTSTRIP); /* Common case */
12274 else
12275 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012276}
12277
12278
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012279PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012280 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012281\n\
12282Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012283If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012284
12285static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012286unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012287{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012288 if (PyTuple_GET_SIZE(args) == 0)
12289 return do_strip(self, RIGHTSTRIP); /* Common case */
12290 else
12291 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012292}
12293
12294
Guido van Rossumd57fd912000-03-10 22:53:23 +000012295static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012296unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012297{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012298 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012299 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012300
Serhiy Storchaka05997252013-01-26 12:14:02 +020012301 if (len < 1)
12302 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012303
Victor Stinnerc4b49542011-12-11 22:44:26 +010012304 /* no repeat, return original string */
12305 if (len == 1)
12306 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012307
Benjamin Petersonbac79492012-01-14 13:34:47 -050012308 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012309 return NULL;
12310
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012311 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012312 PyErr_SetString(PyExc_OverflowError,
12313 "repeated string is too long");
12314 return NULL;
12315 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012316 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012317
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012318 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012319 if (!u)
12320 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012321 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012322
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012323 if (PyUnicode_GET_LENGTH(str) == 1) {
12324 const int kind = PyUnicode_KIND(str);
12325 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012326 if (kind == PyUnicode_1BYTE_KIND) {
12327 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012328 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012329 }
12330 else if (kind == PyUnicode_2BYTE_KIND) {
12331 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012332 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012333 ucs2[n] = fill_char;
12334 } else {
12335 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12336 assert(kind == PyUnicode_4BYTE_KIND);
12337 for (n = 0; n < len; ++n)
12338 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012339 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012340 }
12341 else {
12342 /* number of characters copied this far */
12343 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012344 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012345 char *to = (char *) PyUnicode_DATA(u);
12346 Py_MEMCPY(to, PyUnicode_DATA(str),
12347 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012348 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012349 n = (done <= nchars-done) ? done : nchars-done;
12350 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012351 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012352 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012353 }
12354
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012355 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012356 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012357}
12358
Alexander Belopolsky40018472011-02-26 01:02:56 +000012359PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012360PyUnicode_Replace(PyObject *str,
12361 PyObject *substr,
12362 PyObject *replstr,
Alexander Belopolsky40018472011-02-26 01:02:56 +000012363 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012364{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012365 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12366 ensure_unicode(replstr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012367 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012368 return replace(str, substr, replstr, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012369}
12370
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012371PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000012372 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012373\n\
12374Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000012375old replaced by new. If the optional argument count is\n\
12376given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012377
12378static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012379unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012380{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012381 PyObject *str1;
12382 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012383 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012384
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012385 if (!PyArg_ParseTuple(args, "UU|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012386 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060012387 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012388 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012389 return replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012390}
12391
Alexander Belopolsky40018472011-02-26 01:02:56 +000012392static PyObject *
12393unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012394{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012395 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012396 Py_ssize_t isize;
12397 Py_ssize_t osize, squote, dquote, i, o;
12398 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012399 int ikind, okind, unchanged;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012400 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012401
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012402 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012403 return NULL;
12404
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012405 isize = PyUnicode_GET_LENGTH(unicode);
12406 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012407
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012408 /* Compute length of output, quote characters, and
12409 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012410 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012411 max = 127;
12412 squote = dquote = 0;
12413 ikind = PyUnicode_KIND(unicode);
12414 for (i = 0; i < isize; i++) {
12415 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040012416 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012417 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040012418 case '\'': squote++; break;
12419 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012420 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040012421 incr = 2;
12422 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012423 default:
12424 /* Fast-path ASCII */
12425 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012426 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012427 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012428 ;
12429 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012430 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012431 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012432 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012433 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012434 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012435 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040012436 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012437 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040012438 if (osize > PY_SSIZE_T_MAX - incr) {
12439 PyErr_SetString(PyExc_OverflowError,
12440 "string is too long to generate repr");
12441 return NULL;
12442 }
12443 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012444 }
12445
12446 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012447 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012448 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012449 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012450 if (dquote)
12451 /* Both squote and dquote present. Use squote,
12452 and escape them */
12453 osize += squote;
12454 else
12455 quote = '"';
12456 }
Victor Stinner55c08782013-04-14 18:45:39 +020012457 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012458
12459 repr = PyUnicode_New(osize, max);
12460 if (repr == NULL)
12461 return NULL;
12462 okind = PyUnicode_KIND(repr);
12463 odata = PyUnicode_DATA(repr);
12464
12465 PyUnicode_WRITE(okind, odata, 0, quote);
12466 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012467 if (unchanged) {
12468 _PyUnicode_FastCopyCharacters(repr, 1,
12469 unicode, 0,
12470 isize);
12471 }
12472 else {
12473 for (i = 0, o = 1; i < isize; i++) {
12474 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012475
Victor Stinner55c08782013-04-14 18:45:39 +020012476 /* Escape quotes and backslashes */
12477 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012478 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012479 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012480 continue;
12481 }
12482
12483 /* Map special whitespace to '\t', \n', '\r' */
12484 if (ch == '\t') {
12485 PyUnicode_WRITE(okind, odata, o++, '\\');
12486 PyUnicode_WRITE(okind, odata, o++, 't');
12487 }
12488 else if (ch == '\n') {
12489 PyUnicode_WRITE(okind, odata, o++, '\\');
12490 PyUnicode_WRITE(okind, odata, o++, 'n');
12491 }
12492 else if (ch == '\r') {
12493 PyUnicode_WRITE(okind, odata, o++, '\\');
12494 PyUnicode_WRITE(okind, odata, o++, 'r');
12495 }
12496
12497 /* Map non-printable US ASCII to '\xhh' */
12498 else if (ch < ' ' || ch == 0x7F) {
12499 PyUnicode_WRITE(okind, odata, o++, '\\');
12500 PyUnicode_WRITE(okind, odata, o++, 'x');
12501 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12502 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12503 }
12504
12505 /* Copy ASCII characters as-is */
12506 else if (ch < 0x7F) {
12507 PyUnicode_WRITE(okind, odata, o++, ch);
12508 }
12509
12510 /* Non-ASCII characters */
12511 else {
12512 /* Map Unicode whitespace and control characters
12513 (categories Z* and C* except ASCII space)
12514 */
12515 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12516 PyUnicode_WRITE(okind, odata, o++, '\\');
12517 /* Map 8-bit characters to '\xhh' */
12518 if (ch <= 0xff) {
12519 PyUnicode_WRITE(okind, odata, o++, 'x');
12520 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12521 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12522 }
12523 /* Map 16-bit characters to '\uxxxx' */
12524 else if (ch <= 0xffff) {
12525 PyUnicode_WRITE(okind, odata, o++, 'u');
12526 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12527 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12528 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12529 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12530 }
12531 /* Map 21-bit characters to '\U00xxxxxx' */
12532 else {
12533 PyUnicode_WRITE(okind, odata, o++, 'U');
12534 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12535 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12536 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12537 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12538 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12539 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12540 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12541 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12542 }
12543 }
12544 /* Copy characters as-is */
12545 else {
12546 PyUnicode_WRITE(okind, odata, o++, ch);
12547 }
Georg Brandl559e5d72008-06-11 18:37:52 +000012548 }
12549 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012550 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012551 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012552 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012553 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012554}
12555
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012556PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012557 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012558\n\
12559Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012560such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012561arguments start and end are interpreted as in slice notation.\n\
12562\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012563Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012564
12565static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012566unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012567{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012568 /* initialize variables to prevent gcc warning */
12569 PyObject *substring = NULL;
12570 Py_ssize_t start = 0;
12571 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012572 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012573
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012574 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012575 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012576
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012577 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012578 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012579
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012580 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012581
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012582 if (result == -2)
12583 return NULL;
12584
Christian Heimes217cfd12007-12-02 14:31:20 +000012585 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012586}
12587
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012588PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012589 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012590\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012591Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012592
12593static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012594unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012595{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012596 /* initialize variables to prevent gcc warning */
12597 PyObject *substring = NULL;
12598 Py_ssize_t start = 0;
12599 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012600 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012601
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012602 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012603 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012604
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012605 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012606 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012607
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012608 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012609
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012610 if (result == -2)
12611 return NULL;
12612
Guido van Rossumd57fd912000-03-10 22:53:23 +000012613 if (result < 0) {
12614 PyErr_SetString(PyExc_ValueError, "substring not found");
12615 return NULL;
12616 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012617
Christian Heimes217cfd12007-12-02 14:31:20 +000012618 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012619}
12620
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012621PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012622 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012623\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012624Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012625done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012626
12627static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012628unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012629{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012630 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012631 Py_UCS4 fillchar = ' ';
12632
Victor Stinnere9a29352011-10-01 02:14:59 +020012633 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012634 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012635
Benjamin Petersonbac79492012-01-14 13:34:47 -050012636 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012637 return NULL;
12638
Victor Stinnerc4b49542011-12-11 22:44:26 +010012639 if (PyUnicode_GET_LENGTH(self) >= width)
12640 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012641
Victor Stinnerc4b49542011-12-11 22:44:26 +010012642 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012643}
12644
Alexander Belopolsky40018472011-02-26 01:02:56 +000012645PyObject *
12646PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012647{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012648 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012649 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012650
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012651 return split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012652}
12653
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012654PyDoc_STRVAR(split__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012655 "S.split(sep=None, maxsplit=-1) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012656\n\
12657Return a list of the words in S, using sep as the\n\
12658delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012659splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012660whitespace string is a separator and empty strings are\n\
12661removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012662
12663static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012664unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012665{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012666 static char *kwlist[] = {"sep", "maxsplit", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000012667 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012668 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012669
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012670 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12671 kwlist, &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012672 return NULL;
12673
12674 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012675 return split(self, NULL, maxcount);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012676
12677 if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012678 return split(self, substring, maxcount);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012679
12680 PyErr_Format(PyExc_TypeError,
12681 "must be str or None, not %.100s",
12682 Py_TYPE(substring)->tp_name);
12683 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012684}
12685
Thomas Wouters477c8d52006-05-27 19:21:47 +000012686PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012687PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012688{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012689 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012690 int kind1, kind2;
12691 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012692 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012693
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012694 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012695 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012696
Victor Stinner14f8f022011-10-05 20:58:25 +020012697 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012698 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012699 len1 = PyUnicode_GET_LENGTH(str_obj);
12700 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012701 if (kind1 < kind2 || len1 < len2) {
12702 _Py_INCREF_UNICODE_EMPTY();
12703 if (!unicode_empty)
12704 out = NULL;
12705 else {
12706 out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
12707 Py_DECREF(unicode_empty);
12708 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012709 return out;
12710 }
12711 buf1 = PyUnicode_DATA(str_obj);
12712 buf2 = PyUnicode_DATA(sep_obj);
12713 if (kind2 != kind1) {
12714 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12715 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012716 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012717 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012718
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012719 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012720 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012721 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12722 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12723 else
12724 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012725 break;
12726 case PyUnicode_2BYTE_KIND:
12727 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12728 break;
12729 case PyUnicode_4BYTE_KIND:
12730 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12731 break;
12732 default:
12733 assert(0);
12734 out = 0;
12735 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012736
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012737 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012738 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012739
12740 return out;
12741}
12742
12743
12744PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012745PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012746{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012747 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012748 int kind1, kind2;
12749 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012750 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012751
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012752 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012753 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012754
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012755 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012756 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012757 len1 = PyUnicode_GET_LENGTH(str_obj);
12758 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012759 if (kind1 < kind2 || len1 < len2) {
12760 _Py_INCREF_UNICODE_EMPTY();
12761 if (!unicode_empty)
12762 out = NULL;
12763 else {
12764 out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
12765 Py_DECREF(unicode_empty);
12766 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012767 return out;
12768 }
12769 buf1 = PyUnicode_DATA(str_obj);
12770 buf2 = PyUnicode_DATA(sep_obj);
12771 if (kind2 != kind1) {
12772 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12773 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012774 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012775 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012776
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012777 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012778 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012779 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12780 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12781 else
12782 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012783 break;
12784 case PyUnicode_2BYTE_KIND:
12785 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12786 break;
12787 case PyUnicode_4BYTE_KIND:
12788 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12789 break;
12790 default:
12791 assert(0);
12792 out = 0;
12793 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012794
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012795 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012796 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012797
12798 return out;
12799}
12800
12801PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012802 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012803\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012804Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012805the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012806found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012807
12808static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012809unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012810{
Victor Stinner9310abb2011-10-05 00:59:23 +020012811 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012812}
12813
12814PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012815 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012816\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012817Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012818the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012819separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012820
12821static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012822unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012823{
Victor Stinner9310abb2011-10-05 00:59:23 +020012824 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012825}
12826
Alexander Belopolsky40018472011-02-26 01:02:56 +000012827PyObject *
12828PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012829{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012830 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012831 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012832
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012833 return rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012834}
12835
12836PyDoc_STRVAR(rsplit__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012837 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012838\n\
12839Return a list of the words in S, using sep as the\n\
12840delimiter string, starting at the end of the string and\n\
12841working to the front. If maxsplit is given, at most maxsplit\n\
12842splits are done. If sep is not specified, any whitespace string\n\
12843is a separator.");
12844
12845static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012846unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012847{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012848 static char *kwlist[] = {"sep", "maxsplit", 0};
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012849 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012850 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012851
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012852 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
12853 kwlist, &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012854 return NULL;
12855
12856 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012857 return rsplit(self, NULL, maxcount);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012858
12859 if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012860 return rsplit(self, substring, maxcount);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012861
12862 PyErr_Format(PyExc_TypeError,
12863 "must be str or None, not %.100s",
12864 Py_TYPE(substring)->tp_name);
12865 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012866}
12867
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012868PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012869 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012870\n\
12871Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012872Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012873is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012874
12875static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012876unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012877{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012878 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012879 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012880
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012881 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12882 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012883 return NULL;
12884
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012885 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012886}
12887
12888static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012889PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012890{
Victor Stinnerc4b49542011-12-11 22:44:26 +010012891 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012892}
12893
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012894PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012895 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012896\n\
12897Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012898and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012899
12900static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012901unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012902{
Benjamin Petersoneea48462012-01-16 14:28:50 -050012903 if (PyUnicode_READY(self) == -1)
12904 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012905 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012906}
12907
Larry Hastings61272b72014-01-07 12:41:53 -080012908/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000012909
Larry Hastings31826802013-10-19 00:09:25 -070012910@staticmethod
12911str.maketrans as unicode_maketrans
12912
12913 x: object
12914
12915 y: unicode=NULL
12916
12917 z: unicode=NULL
12918
12919 /
12920
12921Return a translation table usable for str.translate().
12922
12923If there is only one argument, it must be a dictionary mapping Unicode
12924ordinals (integers) or characters to Unicode ordinals, strings or None.
12925Character keys will be then converted to ordinals.
12926If there are two arguments, they must be strings of equal length, and
12927in the resulting dictionary, each character in x will be mapped to the
12928character at the same position in y. If there is a third argument, it
12929must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080012930[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070012931
Larry Hastings31826802013-10-19 00:09:25 -070012932static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080012933unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030012934/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070012935{
Georg Brandlceee0772007-11-27 23:48:05 +000012936 PyObject *new = NULL, *key, *value;
12937 Py_ssize_t i = 0;
12938 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012939
Georg Brandlceee0772007-11-27 23:48:05 +000012940 new = PyDict_New();
12941 if (!new)
12942 return NULL;
12943 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012944 int x_kind, y_kind, z_kind;
12945 void *x_data, *y_data, *z_data;
12946
Georg Brandlceee0772007-11-27 23:48:05 +000012947 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012948 if (!PyUnicode_Check(x)) {
12949 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12950 "be a string if there is a second argument");
12951 goto err;
12952 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012953 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012954 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12955 "arguments must have equal length");
12956 goto err;
12957 }
12958 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012959 x_kind = PyUnicode_KIND(x);
12960 y_kind = PyUnicode_KIND(y);
12961 x_data = PyUnicode_DATA(x);
12962 y_data = PyUnicode_DATA(y);
12963 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12964 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012965 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000012966 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060012967 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012968 if (!value) {
12969 Py_DECREF(key);
12970 goto err;
12971 }
Georg Brandlceee0772007-11-27 23:48:05 +000012972 res = PyDict_SetItem(new, key, value);
12973 Py_DECREF(key);
12974 Py_DECREF(value);
12975 if (res < 0)
12976 goto err;
12977 }
12978 /* create entries for deleting chars in z */
12979 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012980 z_kind = PyUnicode_KIND(z);
12981 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012982 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012983 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012984 if (!key)
12985 goto err;
12986 res = PyDict_SetItem(new, key, Py_None);
12987 Py_DECREF(key);
12988 if (res < 0)
12989 goto err;
12990 }
12991 }
12992 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012993 int kind;
12994 void *data;
12995
Georg Brandlceee0772007-11-27 23:48:05 +000012996 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012997 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012998 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12999 "to maketrans it must be a dict");
13000 goto err;
13001 }
13002 /* copy entries into the new dict, converting string keys to int keys */
13003 while (PyDict_Next(x, &i, &key, &value)) {
13004 if (PyUnicode_Check(key)) {
13005 /* convert string keys to integer keys */
13006 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013007 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013008 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13009 "table must be of length 1");
13010 goto err;
13011 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013012 kind = PyUnicode_KIND(key);
13013 data = PyUnicode_DATA(key);
13014 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013015 if (!newkey)
13016 goto err;
13017 res = PyDict_SetItem(new, newkey, value);
13018 Py_DECREF(newkey);
13019 if (res < 0)
13020 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013021 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013022 /* just keep integer keys */
13023 if (PyDict_SetItem(new, key, value) < 0)
13024 goto err;
13025 } else {
13026 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13027 "be strings or integers");
13028 goto err;
13029 }
13030 }
13031 }
13032 return new;
13033 err:
13034 Py_DECREF(new);
13035 return NULL;
13036}
13037
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013038PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013039 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013040\n\
Zachary Ware79b98df2015-08-05 23:54:15 -050013041Return a copy of the string S in which each character has been mapped\n\
13042through the given translation table. The table must implement\n\
13043lookup/indexing via __getitem__, for instance a dictionary or list,\n\
13044mapping Unicode ordinals to Unicode ordinals, strings, or None. If\n\
13045this operation raises LookupError, the character is left untouched.\n\
13046Characters mapped to None are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013047
13048static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013049unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013050{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013051 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013052}
13053
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013054PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013055 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013056\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013057Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013058
13059static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020013060unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013061{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013062 if (PyUnicode_READY(self) == -1)
13063 return NULL;
13064 if (PyUnicode_IS_ASCII(self))
13065 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013066 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013067}
13068
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013069PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013070 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013071\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000013072Pad a numeric string S with zeros on the left, to fill a field\n\
13073of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013074
13075static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020013076unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013077{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013078 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013079 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013080 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013081 int kind;
13082 void *data;
13083 Py_UCS4 chr;
13084
Martin v. Löwis18e16552006-02-15 17:27:45 +000013085 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000013086 return NULL;
13087
Benjamin Petersonbac79492012-01-14 13:34:47 -050013088 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013089 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013090
Victor Stinnerc4b49542011-12-11 22:44:26 +010013091 if (PyUnicode_GET_LENGTH(self) >= width)
13092 return unicode_result_unchanged(self);
13093
13094 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013095
13096 u = pad(self, fill, 0, '0');
13097
Walter Dörwald068325e2002-04-15 13:36:47 +000013098 if (u == NULL)
13099 return NULL;
13100
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013101 kind = PyUnicode_KIND(u);
13102 data = PyUnicode_DATA(u);
13103 chr = PyUnicode_READ(kind, data, fill);
13104
13105 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013106 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013107 PyUnicode_WRITE(kind, data, 0, chr);
13108 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013109 }
13110
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013111 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013112 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013113}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013114
13115#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013116static PyObject *
13117unicode__decimal2ascii(PyObject *self)
13118{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013119 return PyUnicode_TransformDecimalAndSpaceToASCII(self);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013120}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013121#endif
13122
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013123PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013124 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013125\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013126Return True if S starts with the specified prefix, False otherwise.\n\
13127With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013128With optional end, stop comparing S at that position.\n\
13129prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013130
13131static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013132unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013133 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013134{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013135 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013136 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013137 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013138 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013139 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013140
Jesus Ceaac451502011-04-20 17:09:23 +020013141 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013142 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013143 if (PyTuple_Check(subobj)) {
13144 Py_ssize_t i;
13145 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013146 substring = PyTuple_GET_ITEM(subobj, i);
13147 if (!PyUnicode_Check(substring)) {
13148 PyErr_Format(PyExc_TypeError,
13149 "tuple for startswith must only contain str, "
13150 "not %.100s",
13151 Py_TYPE(substring)->tp_name);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013152 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013153 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013154 result = tailmatch(self, substring, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013155 if (result == -1)
13156 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013157 if (result) {
13158 Py_RETURN_TRUE;
13159 }
13160 }
13161 /* nothing matched */
13162 Py_RETURN_FALSE;
13163 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013164 if (!PyUnicode_Check(subobj)) {
13165 PyErr_Format(PyExc_TypeError,
13166 "startswith first arg must be str or "
13167 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013168 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013169 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013170 result = tailmatch(self, subobj, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013171 if (result == -1)
13172 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013173 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013174}
13175
13176
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013177PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013178 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013179\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013180Return True if S ends with the specified suffix, False otherwise.\n\
13181With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013182With optional end, stop comparing S at that position.\n\
13183suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013184
13185static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013186unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013187 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013188{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013189 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013190 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013191 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013192 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013193 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013194
Jesus Ceaac451502011-04-20 17:09:23 +020013195 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013196 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013197 if (PyTuple_Check(subobj)) {
13198 Py_ssize_t i;
13199 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013200 substring = PyTuple_GET_ITEM(subobj, i);
13201 if (!PyUnicode_Check(substring)) {
13202 PyErr_Format(PyExc_TypeError,
13203 "tuple for endswith must only contain str, "
13204 "not %.100s",
13205 Py_TYPE(substring)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013206 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013207 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013208 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013209 if (result == -1)
13210 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013211 if (result) {
13212 Py_RETURN_TRUE;
13213 }
13214 }
13215 Py_RETURN_FALSE;
13216 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013217 if (!PyUnicode_Check(subobj)) {
13218 PyErr_Format(PyExc_TypeError,
13219 "endswith first arg must be str or "
13220 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013221 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013222 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013223 result = tailmatch(self, subobj, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013224 if (result == -1)
13225 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013226 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013227}
13228
Victor Stinner202fdca2012-05-07 12:47:02 +020013229Py_LOCAL_INLINE(void)
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013230_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013231{
Victor Stinnereb36fda2015-10-03 01:55:51 +020013232 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13233 writer->data = PyUnicode_DATA(writer->buffer);
13234
13235 if (!writer->readonly) {
13236 writer->kind = PyUnicode_KIND(writer->buffer);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013237 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinnereb36fda2015-10-03 01:55:51 +020013238 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013239 else {
Victor Stinnereb36fda2015-10-03 01:55:51 +020013240 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13241 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13242 writer->kind = PyUnicode_WCHAR_KIND;
13243 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13244
Victor Stinner8f674cc2013-04-17 23:02:17 +020013245 /* Copy-on-write mode: set buffer size to 0 so
13246 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13247 * next write. */
13248 writer->size = 0;
13249 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013250}
13251
Victor Stinnerd3f08822012-05-29 12:57:52 +020013252void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013253_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013254{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013255 memset(writer, 0, sizeof(*writer));
Victor Stinnereb36fda2015-10-03 01:55:51 +020013256
13257 /* ASCII is the bare minimum */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013258 writer->min_char = 127;
Victor Stinnereb36fda2015-10-03 01:55:51 +020013259
13260 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13261 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13262 writer->kind = PyUnicode_WCHAR_KIND;
13263 assert(writer->kind <= PyUnicode_1BYTE_KIND);
Victor Stinner202fdca2012-05-07 12:47:02 +020013264}
13265
Victor Stinnerd3f08822012-05-29 12:57:52 +020013266int
13267_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13268 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013269{
13270 Py_ssize_t newlen;
13271 PyObject *newbuffer;
13272
Victor Stinnerca9381e2015-09-22 00:58:32 +020013273 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
Victor Stinner61744742015-09-22 01:01:17 +020013274 assert((maxchar > writer->maxchar && length >= 0)
13275 || length > 0);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013276
Victor Stinner202fdca2012-05-07 12:47:02 +020013277 if (length > PY_SSIZE_T_MAX - writer->pos) {
13278 PyErr_NoMemory();
13279 return -1;
13280 }
13281 newlen = writer->pos + length;
13282
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013283 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013284
Victor Stinnerd3f08822012-05-29 12:57:52 +020013285 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013286 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013287 if (writer->overallocate
13288 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13289 /* overallocate to limit the number of realloc() */
13290 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013291 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013292 if (newlen < writer->min_length)
13293 newlen = writer->min_length;
13294
Victor Stinnerd3f08822012-05-29 12:57:52 +020013295 writer->buffer = PyUnicode_New(newlen, maxchar);
13296 if (writer->buffer == NULL)
13297 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013298 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013299 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013300 if (writer->overallocate
13301 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13302 /* overallocate to limit the number of realloc() */
13303 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013304 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013305 if (newlen < writer->min_length)
13306 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013307
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013308 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013309 /* resize + widen */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +030013310 maxchar = Py_MAX(maxchar, writer->maxchar);
Victor Stinner202fdca2012-05-07 12:47:02 +020013311 newbuffer = PyUnicode_New(newlen, maxchar);
13312 if (newbuffer == NULL)
13313 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013314 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13315 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013316 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013317 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013318 }
13319 else {
13320 newbuffer = resize_compact(writer->buffer, newlen);
13321 if (newbuffer == NULL)
13322 return -1;
13323 }
13324 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013325 }
13326 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013327 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013328 newbuffer = PyUnicode_New(writer->size, maxchar);
13329 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013330 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013331 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13332 writer->buffer, 0, writer->pos);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030013333 Py_SETREF(writer->buffer, newbuffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013334 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013335 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013336 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010013337
13338#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020013339}
13340
Victor Stinnerca9381e2015-09-22 00:58:32 +020013341int
13342_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13343 enum PyUnicode_Kind kind)
13344{
13345 Py_UCS4 maxchar;
13346
13347 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13348 assert(writer->kind < kind);
13349
13350 switch (kind)
13351 {
13352 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13353 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13354 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
13355 default:
13356 assert(0 && "invalid kind");
13357 return -1;
13358 }
13359
13360 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13361}
13362
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013363Py_LOCAL_INLINE(int)
13364_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013365{
13366 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13367 return -1;
13368 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13369 writer->pos++;
13370 return 0;
13371}
13372
13373int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013374_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13375{
13376 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13377}
13378
13379int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013380_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13381{
13382 Py_UCS4 maxchar;
13383 Py_ssize_t len;
13384
13385 if (PyUnicode_READY(str) == -1)
13386 return -1;
13387 len = PyUnicode_GET_LENGTH(str);
13388 if (len == 0)
13389 return 0;
13390 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13391 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013392 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010013393 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020013394 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013395 Py_INCREF(str);
13396 writer->buffer = str;
13397 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013398 writer->pos += len;
13399 return 0;
13400 }
13401 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13402 return -1;
13403 }
13404 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13405 str, 0, len);
13406 writer->pos += len;
13407 return 0;
13408}
13409
Victor Stinnere215d962012-10-06 23:03:36 +020013410int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013411_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13412 Py_ssize_t start, Py_ssize_t end)
13413{
13414 Py_UCS4 maxchar;
13415 Py_ssize_t len;
13416
13417 if (PyUnicode_READY(str) == -1)
13418 return -1;
13419
13420 assert(0 <= start);
13421 assert(end <= PyUnicode_GET_LENGTH(str));
13422 assert(start <= end);
13423
13424 if (end == 0)
13425 return 0;
13426
13427 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13428 return _PyUnicodeWriter_WriteStr(writer, str);
13429
13430 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13431 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13432 else
13433 maxchar = writer->maxchar;
13434 len = end - start;
13435
13436 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13437 return -1;
13438
13439 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13440 str, start, len);
13441 writer->pos += len;
13442 return 0;
13443}
13444
13445int
Victor Stinner4a587072013-11-19 12:54:53 +010013446_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13447 const char *ascii, Py_ssize_t len)
13448{
13449 if (len == -1)
13450 len = strlen(ascii);
13451
13452 assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13453
13454 if (writer->buffer == NULL && !writer->overallocate) {
13455 PyObject *str;
13456
13457 str = _PyUnicode_FromASCII(ascii, len);
13458 if (str == NULL)
13459 return -1;
13460
13461 writer->readonly = 1;
13462 writer->buffer = str;
13463 _PyUnicodeWriter_Update(writer);
13464 writer->pos += len;
13465 return 0;
13466 }
13467
13468 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13469 return -1;
13470
13471 switch (writer->kind)
13472 {
13473 case PyUnicode_1BYTE_KIND:
13474 {
13475 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13476 Py_UCS1 *data = writer->data;
13477
13478 Py_MEMCPY(data + writer->pos, str, len);
13479 break;
13480 }
13481 case PyUnicode_2BYTE_KIND:
13482 {
13483 _PyUnicode_CONVERT_BYTES(
13484 Py_UCS1, Py_UCS2,
13485 ascii, ascii + len,
13486 (Py_UCS2 *)writer->data + writer->pos);
13487 break;
13488 }
13489 case PyUnicode_4BYTE_KIND:
13490 {
13491 _PyUnicode_CONVERT_BYTES(
13492 Py_UCS1, Py_UCS4,
13493 ascii, ascii + len,
13494 (Py_UCS4 *)writer->data + writer->pos);
13495 break;
13496 }
13497 default:
13498 assert(0);
13499 }
13500
13501 writer->pos += len;
13502 return 0;
13503}
13504
13505int
13506_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13507 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020013508{
13509 Py_UCS4 maxchar;
13510
13511 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13512 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13513 return -1;
13514 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13515 writer->pos += len;
13516 return 0;
13517}
13518
Victor Stinnerd3f08822012-05-29 12:57:52 +020013519PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013520_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013521{
Victor Stinner15a0bd32013-07-08 22:29:55 +020013522 PyObject *str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013523 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013524 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013525 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013526 }
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013527 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013528 str = writer->buffer;
13529 writer->buffer = NULL;
13530 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13531 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013532 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013533 if (writer->pos == 0) {
13534 Py_CLEAR(writer->buffer);
13535
13536 /* Get the empty Unicode string singleton ('') */
13537 _Py_INCREF_UNICODE_EMPTY();
13538 str = unicode_empty;
Victor Stinner202fdca2012-05-07 12:47:02 +020013539 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013540 else {
13541 str = writer->buffer;
13542 writer->buffer = NULL;
13543
13544 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
13545 PyObject *str2;
13546 str2 = resize_compact(str, writer->pos);
13547 if (str2 == NULL)
13548 return NULL;
13549 str = str2;
13550 }
13551 }
13552
Victor Stinner15a0bd32013-07-08 22:29:55 +020013553 assert(_PyUnicode_CheckConsistency(str, 1));
13554 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020013555}
13556
Victor Stinnerd3f08822012-05-29 12:57:52 +020013557void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013558_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013559{
13560 Py_CLEAR(writer->buffer);
13561}
13562
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013563#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013564
13565PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013566 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013567\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013568Return a formatted version of S, using substitutions from args and kwargs.\n\
13569The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013570
Eric Smith27bbca62010-11-04 17:06:58 +000013571PyDoc_STRVAR(format_map__doc__,
13572 "S.format_map(mapping) -> str\n\
13573\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013574Return a formatted version of S, using substitutions from mapping.\n\
13575The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013576
Eric Smith4a7d76d2008-05-30 18:10:19 +000013577static PyObject *
13578unicode__format__(PyObject* self, PyObject* args)
13579{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013580 PyObject *format_spec;
13581 _PyUnicodeWriter writer;
13582 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013583
13584 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
13585 return NULL;
13586
Victor Stinnerd3f08822012-05-29 12:57:52 +020013587 if (PyUnicode_READY(self) == -1)
13588 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020013589 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013590 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13591 self, format_spec, 0,
13592 PyUnicode_GET_LENGTH(format_spec));
13593 if (ret == -1) {
13594 _PyUnicodeWriter_Dealloc(&writer);
13595 return NULL;
13596 }
13597 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013598}
13599
Eric Smith8c663262007-08-25 02:26:07 +000013600PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013601 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013602\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013603Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000013604
13605static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013606unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013607{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013608 Py_ssize_t size;
13609
13610 /* If it's a compact object, account for base structure +
13611 character data. */
13612 if (PyUnicode_IS_COMPACT_ASCII(v))
13613 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
13614 else if (PyUnicode_IS_COMPACT(v))
13615 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013616 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013617 else {
13618 /* If it is a two-block object, account for base object, and
13619 for character block if present. */
13620 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020013621 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013622 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013623 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013624 }
13625 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013626 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020013627 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013628 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020013629 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020013630 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013631
13632 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013633}
13634
13635PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013636 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013637
13638static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013639unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013640{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013641 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013642 if (!copy)
13643 return NULL;
13644 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013645}
13646
Guido van Rossumd57fd912000-03-10 22:53:23 +000013647static PyMethodDef unicode_methods[] = {
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000013648 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013649 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
Ezio Melotticda6b6d2012-02-26 09:39:55 +020013650 {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
13651 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013652 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
13653 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
Benjamin Petersond5890c82012-01-14 13:23:30 -050013654 {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013655 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
13656 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
13657 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
Ezio Melotti745d54d2013-11-16 19:10:57 +020013658 {"expandtabs", (PyCFunction) unicode_expandtabs,
13659 METH_VARARGS | METH_KEYWORDS, expandtabs__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013660 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013661 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013662 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
13663 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
13664 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013665 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013666 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13667 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
13668 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013669 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013670 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Ezio Melotti745d54d2013-11-16 19:10:57 +020013671 {"splitlines", (PyCFunction) unicode_splitlines,
13672 METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013673 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013674 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
13675 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
13676 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
13677 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13678 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
13679 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
13680 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
13681 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
13682 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
13683 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
13684 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
13685 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
13686 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
13687 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000013688 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000013689 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013690 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000013691 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013692 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000013693 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Larry Hastings31826802013-10-19 00:09:25 -070013694 UNICODE_MAKETRANS_METHODDEF
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013695 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000013696#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013697 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013698 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013699#endif
13700
Benjamin Peterson14339b62009-01-31 16:36:08 +000013701 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013702 {NULL, NULL}
13703};
13704
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013705static PyObject *
13706unicode_mod(PyObject *v, PyObject *w)
13707{
Brian Curtindfc80e32011-08-10 20:28:54 -050013708 if (!PyUnicode_Check(v))
13709 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013710 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013711}
13712
13713static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013714 0, /*nb_add*/
13715 0, /*nb_subtract*/
13716 0, /*nb_multiply*/
13717 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013718};
13719
Guido van Rossumd57fd912000-03-10 22:53:23 +000013720static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013721 (lenfunc) unicode_length, /* sq_length */
13722 PyUnicode_Concat, /* sq_concat */
13723 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13724 (ssizeargfunc) unicode_getitem, /* sq_item */
13725 0, /* sq_slice */
13726 0, /* sq_ass_item */
13727 0, /* sq_ass_slice */
13728 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013729};
13730
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013731static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013732unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013733{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013734 if (PyUnicode_READY(self) == -1)
13735 return NULL;
13736
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013737 if (PyIndex_Check(item)) {
13738 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013739 if (i == -1 && PyErr_Occurred())
13740 return NULL;
13741 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013742 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013743 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013744 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013745 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013746 PyObject *result;
13747 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013748 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013749 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013750
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013751 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013752 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013753 return NULL;
13754 }
13755
13756 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020013757 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013758 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013759 slicelength == PyUnicode_GET_LENGTH(self)) {
13760 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013761 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013762 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013763 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013764 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013765 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013766 src_kind = PyUnicode_KIND(self);
13767 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013768 if (!PyUnicode_IS_ASCII(self)) {
13769 kind_limit = kind_maxchar_limit(src_kind);
13770 max_char = 0;
13771 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13772 ch = PyUnicode_READ(src_kind, src_data, cur);
13773 if (ch > max_char) {
13774 max_char = ch;
13775 if (max_char >= kind_limit)
13776 break;
13777 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013778 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013779 }
Victor Stinner55c99112011-10-13 01:17:06 +020013780 else
13781 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013782 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013783 if (result == NULL)
13784 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013785 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013786 dest_data = PyUnicode_DATA(result);
13787
13788 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013789 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13790 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013791 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013792 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013793 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013794 } else {
13795 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13796 return NULL;
13797 }
13798}
13799
13800static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013801 (lenfunc)unicode_length, /* mp_length */
13802 (binaryfunc)unicode_subscript, /* mp_subscript */
13803 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013804};
13805
Guido van Rossumd57fd912000-03-10 22:53:23 +000013806
Guido van Rossumd57fd912000-03-10 22:53:23 +000013807/* Helpers for PyUnicode_Format() */
13808
Victor Stinnera47082312012-10-04 02:19:54 +020013809struct unicode_formatter_t {
13810 PyObject *args;
13811 int args_owned;
13812 Py_ssize_t arglen, argidx;
13813 PyObject *dict;
13814
13815 enum PyUnicode_Kind fmtkind;
13816 Py_ssize_t fmtcnt, fmtpos;
13817 void *fmtdata;
13818 PyObject *fmtstr;
13819
13820 _PyUnicodeWriter writer;
13821};
13822
13823struct unicode_format_arg_t {
13824 Py_UCS4 ch;
13825 int flags;
13826 Py_ssize_t width;
13827 int prec;
13828 int sign;
13829};
13830
Guido van Rossumd57fd912000-03-10 22:53:23 +000013831static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020013832unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013833{
Victor Stinnera47082312012-10-04 02:19:54 +020013834 Py_ssize_t argidx = ctx->argidx;
13835
13836 if (argidx < ctx->arglen) {
13837 ctx->argidx++;
13838 if (ctx->arglen < 0)
13839 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000013840 else
Victor Stinnera47082312012-10-04 02:19:54 +020013841 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013842 }
13843 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013844 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013845 return NULL;
13846}
13847
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013848/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013849
Victor Stinnera47082312012-10-04 02:19:54 +020013850/* Format a float into the writer if the writer is not NULL, or into *p_output
13851 otherwise.
13852
13853 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020013854static int
Victor Stinnera47082312012-10-04 02:19:54 +020013855formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
13856 PyObject **p_output,
13857 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013858{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013859 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013860 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013861 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020013862 int prec;
13863 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000013864
Guido van Rossumd57fd912000-03-10 22:53:23 +000013865 x = PyFloat_AsDouble(v);
13866 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020013867 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013868
Victor Stinnera47082312012-10-04 02:19:54 +020013869 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013870 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013871 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013872
Victor Stinnera47082312012-10-04 02:19:54 +020013873 if (arg->flags & F_ALT)
13874 dtoa_flags = Py_DTSF_ALT;
13875 else
13876 dtoa_flags = 0;
13877 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013878 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020013879 return -1;
13880 len = strlen(p);
13881 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010013882 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020013883 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013884 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020013885 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020013886 }
13887 else
13888 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000013889 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013890 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013891}
13892
Victor Stinnerd0880d52012-04-27 23:40:13 +020013893/* formatlong() emulates the format codes d, u, o, x and X, and
13894 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
13895 * Python's regular ints.
13896 * Return value: a new PyUnicodeObject*, or NULL if error.
13897 * The output string is of the form
13898 * "-"? ("0x" | "0X")? digit+
13899 * "0x"/"0X" are present only for x and X conversions, with F_ALT
13900 * set in flags. The case of hex digits will be correct,
13901 * There will be at least prec digits, zero-filled on the left if
13902 * necessary to get that many.
13903 * val object to be converted
13904 * flags bitmask of format flags; only F_ALT is looked at
13905 * prec minimum number of digits; 0-fill on left if needed
13906 * type a character in [duoxX]; u acts the same as d
13907 *
13908 * CAUTION: o, x and X conversions on regular ints can never
13909 * produce a '-' sign, but can for Python's unbounded ints.
13910 */
Ethan Furmanb95b5612015-01-23 20:05:18 -080013911PyObject *
13912_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
Tim Peters38fd5b62000-09-21 05:43:11 +000013913{
Victor Stinnerd0880d52012-04-27 23:40:13 +020013914 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013915 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013916 Py_ssize_t i;
13917 int sign; /* 1 if '-', else 0 */
13918 int len; /* number of characters */
13919 Py_ssize_t llen;
13920 int numdigits; /* len == numnondigits + numdigits */
13921 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000013922
Victor Stinnerd0880d52012-04-27 23:40:13 +020013923 /* Avoid exceeding SSIZE_T_MAX */
13924 if (prec > INT_MAX-3) {
13925 PyErr_SetString(PyExc_OverflowError,
13926 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013927 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013928 }
13929
13930 assert(PyLong_Check(val));
13931
13932 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020013933 default:
13934 assert(!"'type' not in [diuoxX]");
Victor Stinnerd0880d52012-04-27 23:40:13 +020013935 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020013936 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020013937 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070013938 /* int and int subclasses should print numerically when a numeric */
13939 /* format code is used (see issue18780) */
13940 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013941 break;
13942 case 'o':
13943 numnondigits = 2;
13944 result = PyNumber_ToBase(val, 8);
13945 break;
13946 case 'x':
13947 case 'X':
13948 numnondigits = 2;
13949 result = PyNumber_ToBase(val, 16);
13950 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013951 }
13952 if (!result)
13953 return NULL;
13954
13955 assert(unicode_modifiable(result));
13956 assert(PyUnicode_IS_READY(result));
13957 assert(PyUnicode_IS_ASCII(result));
13958
13959 /* To modify the string in-place, there can only be one reference. */
13960 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020013961 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013962 PyErr_BadInternalCall();
13963 return NULL;
13964 }
13965 buf = PyUnicode_DATA(result);
13966 llen = PyUnicode_GET_LENGTH(result);
13967 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020013968 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013969 PyErr_SetString(PyExc_ValueError,
Ethan Furmanb95b5612015-01-23 20:05:18 -080013970 "string too large in _PyUnicode_FormatLong");
Victor Stinnerd0880d52012-04-27 23:40:13 +020013971 return NULL;
13972 }
13973 len = (int)llen;
13974 sign = buf[0] == '-';
13975 numnondigits += sign;
13976 numdigits = len - numnondigits;
13977 assert(numdigits > 0);
13978
13979 /* Get rid of base marker unless F_ALT */
Ethan Furmanb95b5612015-01-23 20:05:18 -080013980 if (((alt) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020013981 (type == 'o' || type == 'x' || type == 'X'))) {
13982 assert(buf[sign] == '0');
13983 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
13984 buf[sign+1] == 'o');
13985 numnondigits -= 2;
13986 buf += 2;
13987 len -= 2;
13988 if (sign)
13989 buf[0] = '-';
13990 assert(len == numnondigits + numdigits);
13991 assert(numdigits > 0);
13992 }
13993
13994 /* Fill with leading zeroes to meet minimum width. */
13995 if (prec > numdigits) {
13996 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
13997 numnondigits + prec);
13998 char *b1;
13999 if (!r1) {
14000 Py_DECREF(result);
14001 return NULL;
14002 }
14003 b1 = PyBytes_AS_STRING(r1);
14004 for (i = 0; i < numnondigits; ++i)
14005 *b1++ = *buf++;
14006 for (i = 0; i < prec - numdigits; i++)
14007 *b1++ = '0';
14008 for (i = 0; i < numdigits; i++)
14009 *b1++ = *buf++;
14010 *b1 = '\0';
14011 Py_DECREF(result);
14012 result = r1;
14013 buf = PyBytes_AS_STRING(result);
14014 len = numnondigits + prec;
14015 }
14016
14017 /* Fix up case for hex conversions. */
14018 if (type == 'X') {
14019 /* Need to convert all lower case letters to upper case.
14020 and need to convert 0x to 0X (and -0x to -0X). */
14021 for (i = 0; i < len; i++)
14022 if (buf[i] >= 'a' && buf[i] <= 'x')
14023 buf[i] -= 'a'-'A';
14024 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014025 if (!PyUnicode_Check(result)
14026 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020014027 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014028 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014029 Py_DECREF(result);
14030 result = unicode;
14031 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014032 else if (len != PyUnicode_GET_LENGTH(result)) {
14033 if (PyUnicode_Resize(&result, len) < 0)
14034 Py_CLEAR(result);
14035 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014036 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000014037}
14038
Ethan Furmandf3ed242014-01-05 06:50:30 -080014039/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014040 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014041 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014042 * -1 and raise an exception on error */
14043static int
Victor Stinnera47082312012-10-04 02:19:54 +020014044mainformatlong(PyObject *v,
14045 struct unicode_format_arg_t *arg,
14046 PyObject **p_output,
14047 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014048{
14049 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014050 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014051
14052 if (!PyNumber_Check(v))
14053 goto wrongtype;
14054
Ethan Furman9ab74802014-03-21 06:38:46 -070014055 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014056 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014057 if (type == 'o' || type == 'x' || type == 'X') {
14058 iobj = PyNumber_Index(v);
14059 if (iobj == NULL) {
Ethan Furman9ab74802014-03-21 06:38:46 -070014060 if (PyErr_ExceptionMatches(PyExc_TypeError))
14061 goto wrongtype;
Ethan Furman38d872e2014-03-19 08:38:52 -070014062 return -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014063 }
14064 }
14065 else {
14066 iobj = PyNumber_Long(v);
14067 if (iobj == NULL ) {
14068 if (PyErr_ExceptionMatches(PyExc_TypeError))
14069 goto wrongtype;
14070 return -1;
14071 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014072 }
14073 assert(PyLong_Check(iobj));
14074 }
14075 else {
14076 iobj = v;
14077 Py_INCREF(iobj);
14078 }
14079
14080 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014081 && arg->width == -1 && arg->prec == -1
14082 && !(arg->flags & (F_SIGN | F_BLANK))
14083 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014084 {
14085 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014086 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014087 int base;
14088
Victor Stinnera47082312012-10-04 02:19:54 +020014089 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014090 {
14091 default:
14092 assert(0 && "'type' not in [diuoxX]");
14093 case 'd':
14094 case 'i':
14095 case 'u':
14096 base = 10;
14097 break;
14098 case 'o':
14099 base = 8;
14100 break;
14101 case 'x':
14102 case 'X':
14103 base = 16;
14104 break;
14105 }
14106
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014107 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14108 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014109 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014110 }
14111 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014112 return 1;
14113 }
14114
Ethan Furmanb95b5612015-01-23 20:05:18 -080014115 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014116 Py_DECREF(iobj);
14117 if (res == NULL)
14118 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014119 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014120 return 0;
14121
14122wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014123 switch(type)
14124 {
14125 case 'o':
14126 case 'x':
14127 case 'X':
14128 PyErr_Format(PyExc_TypeError,
14129 "%%%c format: an integer is required, "
14130 "not %.200s",
14131 type, Py_TYPE(v)->tp_name);
14132 break;
14133 default:
14134 PyErr_Format(PyExc_TypeError,
14135 "%%%c format: a number is required, "
14136 "not %.200s",
14137 type, Py_TYPE(v)->tp_name);
14138 break;
14139 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014140 return -1;
14141}
14142
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014143static Py_UCS4
14144formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014145{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014146 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014147 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014148 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014149 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014150 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014151 goto onError;
14152 }
14153 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014154 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014155 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014156 /* make sure number is a type of integer */
14157 if (!PyLong_Check(v)) {
14158 iobj = PyNumber_Index(v);
14159 if (iobj == NULL) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014160 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014161 }
14162 v = iobj;
14163 Py_DECREF(iobj);
14164 }
14165 /* Integer input truncated to a character */
Benjamin Peterson29060642009-01-31 22:14:21 +000014166 x = PyLong_AsLong(v);
14167 if (x == -1 && PyErr_Occurred())
14168 goto onError;
14169
Victor Stinner8faf8212011-12-08 22:14:11 +010014170 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014171 PyErr_SetString(PyExc_OverflowError,
14172 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014173 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014174 }
14175
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014176 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014177 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014178
Benjamin Peterson29060642009-01-31 22:14:21 +000014179 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014180 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014181 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014182 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014183}
14184
Victor Stinnera47082312012-10-04 02:19:54 +020014185/* Parse options of an argument: flags, width, precision.
14186 Handle also "%(name)" syntax.
14187
14188 Return 0 if the argument has been formatted into arg->str.
14189 Return 1 if the argument has been written into ctx->writer,
14190 Raise an exception and return -1 on error. */
14191static int
14192unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14193 struct unicode_format_arg_t *arg)
14194{
14195#define FORMAT_READ(ctx) \
14196 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14197
14198 PyObject *v;
14199
Victor Stinnera47082312012-10-04 02:19:54 +020014200 if (arg->ch == '(') {
14201 /* Get argument value from a dictionary. Example: "%(name)s". */
14202 Py_ssize_t keystart;
14203 Py_ssize_t keylen;
14204 PyObject *key;
14205 int pcount = 1;
14206
14207 if (ctx->dict == NULL) {
14208 PyErr_SetString(PyExc_TypeError,
14209 "format requires a mapping");
14210 return -1;
14211 }
14212 ++ctx->fmtpos;
14213 --ctx->fmtcnt;
14214 keystart = ctx->fmtpos;
14215 /* Skip over balanced parentheses */
14216 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14217 arg->ch = FORMAT_READ(ctx);
14218 if (arg->ch == ')')
14219 --pcount;
14220 else if (arg->ch == '(')
14221 ++pcount;
14222 ctx->fmtpos++;
14223 }
14224 keylen = ctx->fmtpos - keystart - 1;
14225 if (ctx->fmtcnt < 0 || pcount > 0) {
14226 PyErr_SetString(PyExc_ValueError,
14227 "incomplete format key");
14228 return -1;
14229 }
14230 key = PyUnicode_Substring(ctx->fmtstr,
14231 keystart, keystart + keylen);
14232 if (key == NULL)
14233 return -1;
14234 if (ctx->args_owned) {
Victor Stinnera47082312012-10-04 02:19:54 +020014235 ctx->args_owned = 0;
Serhiy Storchaka191321d2015-12-27 15:41:34 +020014236 Py_DECREF(ctx->args);
Victor Stinnera47082312012-10-04 02:19:54 +020014237 }
14238 ctx->args = PyObject_GetItem(ctx->dict, key);
14239 Py_DECREF(key);
14240 if (ctx->args == NULL)
14241 return -1;
14242 ctx->args_owned = 1;
14243 ctx->arglen = -1;
14244 ctx->argidx = -2;
14245 }
14246
14247 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014248 while (--ctx->fmtcnt >= 0) {
14249 arg->ch = FORMAT_READ(ctx);
14250 ctx->fmtpos++;
14251 switch (arg->ch) {
14252 case '-': arg->flags |= F_LJUST; continue;
14253 case '+': arg->flags |= F_SIGN; continue;
14254 case ' ': arg->flags |= F_BLANK; continue;
14255 case '#': arg->flags |= F_ALT; continue;
14256 case '0': arg->flags |= F_ZERO; continue;
14257 }
14258 break;
14259 }
14260
14261 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014262 if (arg->ch == '*') {
14263 v = unicode_format_getnextarg(ctx);
14264 if (v == NULL)
14265 return -1;
14266 if (!PyLong_Check(v)) {
14267 PyErr_SetString(PyExc_TypeError,
14268 "* wants int");
14269 return -1;
14270 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014271 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014272 if (arg->width == -1 && PyErr_Occurred())
14273 return -1;
14274 if (arg->width < 0) {
14275 arg->flags |= F_LJUST;
14276 arg->width = -arg->width;
14277 }
14278 if (--ctx->fmtcnt >= 0) {
14279 arg->ch = FORMAT_READ(ctx);
14280 ctx->fmtpos++;
14281 }
14282 }
14283 else if (arg->ch >= '0' && arg->ch <= '9') {
14284 arg->width = arg->ch - '0';
14285 while (--ctx->fmtcnt >= 0) {
14286 arg->ch = FORMAT_READ(ctx);
14287 ctx->fmtpos++;
14288 if (arg->ch < '0' || arg->ch > '9')
14289 break;
14290 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14291 mixing signed and unsigned comparison. Since arg->ch is between
14292 '0' and '9', casting to int is safe. */
14293 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14294 PyErr_SetString(PyExc_ValueError,
14295 "width too big");
14296 return -1;
14297 }
14298 arg->width = arg->width*10 + (arg->ch - '0');
14299 }
14300 }
14301
14302 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014303 if (arg->ch == '.') {
14304 arg->prec = 0;
14305 if (--ctx->fmtcnt >= 0) {
14306 arg->ch = FORMAT_READ(ctx);
14307 ctx->fmtpos++;
14308 }
14309 if (arg->ch == '*') {
14310 v = unicode_format_getnextarg(ctx);
14311 if (v == NULL)
14312 return -1;
14313 if (!PyLong_Check(v)) {
14314 PyErr_SetString(PyExc_TypeError,
14315 "* wants int");
14316 return -1;
14317 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014318 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014319 if (arg->prec == -1 && PyErr_Occurred())
14320 return -1;
14321 if (arg->prec < 0)
14322 arg->prec = 0;
14323 if (--ctx->fmtcnt >= 0) {
14324 arg->ch = FORMAT_READ(ctx);
14325 ctx->fmtpos++;
14326 }
14327 }
14328 else if (arg->ch >= '0' && arg->ch <= '9') {
14329 arg->prec = arg->ch - '0';
14330 while (--ctx->fmtcnt >= 0) {
14331 arg->ch = FORMAT_READ(ctx);
14332 ctx->fmtpos++;
14333 if (arg->ch < '0' || arg->ch > '9')
14334 break;
14335 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14336 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014337 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014338 return -1;
14339 }
14340 arg->prec = arg->prec*10 + (arg->ch - '0');
14341 }
14342 }
14343 }
14344
14345 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14346 if (ctx->fmtcnt >= 0) {
14347 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14348 if (--ctx->fmtcnt >= 0) {
14349 arg->ch = FORMAT_READ(ctx);
14350 ctx->fmtpos++;
14351 }
14352 }
14353 }
14354 if (ctx->fmtcnt < 0) {
14355 PyErr_SetString(PyExc_ValueError,
14356 "incomplete format");
14357 return -1;
14358 }
14359 return 0;
14360
14361#undef FORMAT_READ
14362}
14363
14364/* Format one argument. Supported conversion specifiers:
14365
14366 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014367 - "i", "d", "u": int or float
14368 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014369 - "e", "E", "f", "F", "g", "G": float
14370 - "c": int or str (1 character)
14371
Victor Stinner8dbd4212012-12-04 09:30:24 +010014372 When possible, the output is written directly into the Unicode writer
14373 (ctx->writer). A string is created when padding is required.
14374
Victor Stinnera47082312012-10-04 02:19:54 +020014375 Return 0 if the argument has been formatted into *p_str,
14376 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014377 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014378static int
14379unicode_format_arg_format(struct unicode_formatter_t *ctx,
14380 struct unicode_format_arg_t *arg,
14381 PyObject **p_str)
14382{
14383 PyObject *v;
14384 _PyUnicodeWriter *writer = &ctx->writer;
14385
14386 if (ctx->fmtcnt == 0)
14387 ctx->writer.overallocate = 0;
14388
14389 if (arg->ch == '%') {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014390 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014391 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014392 return 1;
14393 }
14394
14395 v = unicode_format_getnextarg(ctx);
14396 if (v == NULL)
14397 return -1;
14398
Victor Stinnera47082312012-10-04 02:19:54 +020014399
14400 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014401 case 's':
14402 case 'r':
14403 case 'a':
14404 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14405 /* Fast path */
14406 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14407 return -1;
14408 return 1;
14409 }
14410
14411 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14412 *p_str = v;
14413 Py_INCREF(*p_str);
14414 }
14415 else {
14416 if (arg->ch == 's')
14417 *p_str = PyObject_Str(v);
14418 else if (arg->ch == 'r')
14419 *p_str = PyObject_Repr(v);
14420 else
14421 *p_str = PyObject_ASCII(v);
14422 }
14423 break;
14424
14425 case 'i':
14426 case 'd':
14427 case 'u':
14428 case 'o':
14429 case 'x':
14430 case 'X':
14431 {
14432 int ret = mainformatlong(v, arg, p_str, writer);
14433 if (ret != 0)
14434 return ret;
14435 arg->sign = 1;
14436 break;
14437 }
14438
14439 case 'e':
14440 case 'E':
14441 case 'f':
14442 case 'F':
14443 case 'g':
14444 case 'G':
14445 if (arg->width == -1 && arg->prec == -1
14446 && !(arg->flags & (F_SIGN | F_BLANK)))
14447 {
14448 /* Fast path */
14449 if (formatfloat(v, arg, NULL, writer) == -1)
14450 return -1;
14451 return 1;
14452 }
14453
14454 arg->sign = 1;
14455 if (formatfloat(v, arg, p_str, NULL) == -1)
14456 return -1;
14457 break;
14458
14459 case 'c':
14460 {
14461 Py_UCS4 ch = formatchar(v);
14462 if (ch == (Py_UCS4) -1)
14463 return -1;
14464 if (arg->width == -1 && arg->prec == -1) {
14465 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014466 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014467 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014468 return 1;
14469 }
14470 *p_str = PyUnicode_FromOrdinal(ch);
14471 break;
14472 }
14473
14474 default:
14475 PyErr_Format(PyExc_ValueError,
14476 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020014477 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020014478 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14479 (int)arg->ch,
14480 ctx->fmtpos - 1);
14481 return -1;
14482 }
14483 if (*p_str == NULL)
14484 return -1;
14485 assert (PyUnicode_Check(*p_str));
14486 return 0;
14487}
14488
14489static int
14490unicode_format_arg_output(struct unicode_formatter_t *ctx,
14491 struct unicode_format_arg_t *arg,
14492 PyObject *str)
14493{
14494 Py_ssize_t len;
14495 enum PyUnicode_Kind kind;
14496 void *pbuf;
14497 Py_ssize_t pindex;
14498 Py_UCS4 signchar;
14499 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014500 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014501 Py_ssize_t sublen;
14502 _PyUnicodeWriter *writer = &ctx->writer;
14503 Py_UCS4 fill;
14504
14505 fill = ' ';
14506 if (arg->sign && arg->flags & F_ZERO)
14507 fill = '0';
14508
14509 if (PyUnicode_READY(str) == -1)
14510 return -1;
14511
14512 len = PyUnicode_GET_LENGTH(str);
14513 if ((arg->width == -1 || arg->width <= len)
14514 && (arg->prec == -1 || arg->prec >= len)
14515 && !(arg->flags & (F_SIGN | F_BLANK)))
14516 {
14517 /* Fast path */
14518 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
14519 return -1;
14520 return 0;
14521 }
14522
14523 /* Truncate the string for "s", "r" and "a" formats
14524 if the precision is set */
14525 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
14526 if (arg->prec >= 0 && len > arg->prec)
14527 len = arg->prec;
14528 }
14529
14530 /* Adjust sign and width */
14531 kind = PyUnicode_KIND(str);
14532 pbuf = PyUnicode_DATA(str);
14533 pindex = 0;
14534 signchar = '\0';
14535 if (arg->sign) {
14536 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14537 if (ch == '-' || ch == '+') {
14538 signchar = ch;
14539 len--;
14540 pindex++;
14541 }
14542 else if (arg->flags & F_SIGN)
14543 signchar = '+';
14544 else if (arg->flags & F_BLANK)
14545 signchar = ' ';
14546 else
14547 arg->sign = 0;
14548 }
14549 if (arg->width < len)
14550 arg->width = len;
14551
14552 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014553 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014554 if (!(arg->flags & F_LJUST)) {
14555 if (arg->sign) {
14556 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014557 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014558 }
14559 else {
14560 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014561 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014562 }
14563 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014564 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
14565 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014566 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014567 }
14568
Victor Stinnera47082312012-10-04 02:19:54 +020014569 buflen = arg->width;
14570 if (arg->sign && len == arg->width)
14571 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014572 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020014573 return -1;
14574
14575 /* Write the sign if needed */
14576 if (arg->sign) {
14577 if (fill != ' ') {
14578 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14579 writer->pos += 1;
14580 }
14581 if (arg->width > len)
14582 arg->width--;
14583 }
14584
14585 /* Write the numeric prefix for "x", "X" and "o" formats
14586 if the alternate form is used.
14587 For example, write "0x" for the "%#x" format. */
14588 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14589 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14590 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14591 if (fill != ' ') {
14592 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14593 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14594 writer->pos += 2;
14595 pindex += 2;
14596 }
14597 arg->width -= 2;
14598 if (arg->width < 0)
14599 arg->width = 0;
14600 len -= 2;
14601 }
14602
14603 /* Pad left with the fill character if needed */
14604 if (arg->width > len && !(arg->flags & F_LJUST)) {
14605 sublen = arg->width - len;
14606 FILL(writer->kind, writer->data, fill, writer->pos, sublen);
14607 writer->pos += sublen;
14608 arg->width = len;
14609 }
14610
14611 /* If padding with spaces: write sign if needed and/or numeric prefix if
14612 the alternate form is used */
14613 if (fill == ' ') {
14614 if (arg->sign) {
14615 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14616 writer->pos += 1;
14617 }
14618 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14619 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14620 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
14621 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14622 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14623 writer->pos += 2;
14624 pindex += 2;
14625 }
14626 }
14627
14628 /* Write characters */
14629 if (len) {
14630 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14631 str, pindex, len);
14632 writer->pos += len;
14633 }
14634
14635 /* Pad right with the fill character if needed */
14636 if (arg->width > len) {
14637 sublen = arg->width - len;
14638 FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
14639 writer->pos += sublen;
14640 }
14641 return 0;
14642}
14643
14644/* Helper of PyUnicode_Format(): format one arg.
14645 Return 0 on success, raise an exception and return -1 on error. */
14646static int
14647unicode_format_arg(struct unicode_formatter_t *ctx)
14648{
14649 struct unicode_format_arg_t arg;
14650 PyObject *str;
14651 int ret;
14652
Victor Stinner8dbd4212012-12-04 09:30:24 +010014653 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
14654 arg.flags = 0;
14655 arg.width = -1;
14656 arg.prec = -1;
14657 arg.sign = 0;
14658 str = NULL;
14659
Victor Stinnera47082312012-10-04 02:19:54 +020014660 ret = unicode_format_arg_parse(ctx, &arg);
14661 if (ret == -1)
14662 return -1;
14663
14664 ret = unicode_format_arg_format(ctx, &arg, &str);
14665 if (ret == -1)
14666 return -1;
14667
14668 if (ret != 1) {
14669 ret = unicode_format_arg_output(ctx, &arg, str);
14670 Py_DECREF(str);
14671 if (ret == -1)
14672 return -1;
14673 }
14674
14675 if (ctx->dict && (ctx->argidx < ctx->arglen) && arg.ch != '%') {
14676 PyErr_SetString(PyExc_TypeError,
14677 "not all arguments converted during string formatting");
14678 return -1;
14679 }
14680 return 0;
14681}
14682
Alexander Belopolsky40018472011-02-26 01:02:56 +000014683PyObject *
14684PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014685{
Victor Stinnera47082312012-10-04 02:19:54 +020014686 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000014687
Guido van Rossumd57fd912000-03-10 22:53:23 +000014688 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014689 PyErr_BadInternalCall();
14690 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014691 }
Victor Stinnera47082312012-10-04 02:19:54 +020014692
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014693 if (ensure_unicode(format) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014694 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014695
14696 ctx.fmtstr = format;
Victor Stinnera47082312012-10-04 02:19:54 +020014697 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14698 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14699 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14700 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014701
Victor Stinner8f674cc2013-04-17 23:02:17 +020014702 _PyUnicodeWriter_Init(&ctx.writer);
14703 ctx.writer.min_length = ctx.fmtcnt + 100;
14704 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014705
Guido van Rossumd57fd912000-03-10 22:53:23 +000014706 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014707 ctx.arglen = PyTuple_Size(args);
14708 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014709 }
14710 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014711 ctx.arglen = -1;
14712 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014713 }
Victor Stinnera47082312012-10-04 02:19:54 +020014714 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040014715 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020014716 ctx.dict = args;
14717 else
14718 ctx.dict = NULL;
14719 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014720
Victor Stinnera47082312012-10-04 02:19:54 +020014721 while (--ctx.fmtcnt >= 0) {
14722 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020014723 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020014724
14725 nonfmtpos = ctx.fmtpos++;
14726 while (ctx.fmtcnt >= 0 &&
14727 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14728 ctx.fmtpos++;
14729 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014730 }
Victor Stinnera47082312012-10-04 02:19:54 +020014731 if (ctx.fmtcnt < 0) {
14732 ctx.fmtpos--;
14733 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020014734 }
Victor Stinneree4544c2012-05-09 22:24:08 +020014735
Victor Stinnercfc4c132013-04-03 01:48:39 +020014736 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14737 nonfmtpos, ctx.fmtpos) < 0)
14738 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014739 }
14740 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014741 ctx.fmtpos++;
14742 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000014743 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020014744 }
14745 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020014746
Victor Stinnera47082312012-10-04 02:19:54 +020014747 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014748 PyErr_SetString(PyExc_TypeError,
14749 "not all arguments converted during string formatting");
14750 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014751 }
14752
Victor Stinnera47082312012-10-04 02:19:54 +020014753 if (ctx.args_owned) {
14754 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014755 }
Victor Stinnera47082312012-10-04 02:19:54 +020014756 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014757
Benjamin Peterson29060642009-01-31 22:14:21 +000014758 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020014759 _PyUnicodeWriter_Dealloc(&ctx.writer);
14760 if (ctx.args_owned) {
14761 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014762 }
14763 return NULL;
14764}
14765
Jeremy Hylton938ace62002-07-17 16:30:39 +000014766static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000014767unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14768
Tim Peters6d6c1a32001-08-02 04:15:00 +000014769static PyObject *
14770unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14771{
Benjamin Peterson29060642009-01-31 22:14:21 +000014772 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014773 static char *kwlist[] = {"object", "encoding", "errors", 0};
14774 char *encoding = NULL;
14775 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000014776
Benjamin Peterson14339b62009-01-31 16:36:08 +000014777 if (type != &PyUnicode_Type)
14778 return unicode_subtype_new(type, args, kwds);
14779 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000014780 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000014781 return NULL;
14782 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020014783 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000014784 if (encoding == NULL && errors == NULL)
14785 return PyObject_Str(x);
14786 else
Benjamin Peterson29060642009-01-31 22:14:21 +000014787 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000014788}
14789
Guido van Rossume023fe02001-08-30 03:12:59 +000014790static PyObject *
14791unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14792{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014793 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014794 Py_ssize_t length, char_size;
14795 int share_wstr, share_utf8;
14796 unsigned int kind;
14797 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000014798
Benjamin Peterson14339b62009-01-31 16:36:08 +000014799 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014800
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014801 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014802 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014803 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014804 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050014805 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060014806 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014807 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060014808 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014809
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014810 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014811 if (self == NULL) {
14812 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014813 return NULL;
14814 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014815 kind = PyUnicode_KIND(unicode);
14816 length = PyUnicode_GET_LENGTH(unicode);
14817
14818 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014819#ifdef Py_DEBUG
14820 _PyUnicode_HASH(self) = -1;
14821#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014822 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014823#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014824 _PyUnicode_STATE(self).interned = 0;
14825 _PyUnicode_STATE(self).kind = kind;
14826 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020014827 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014828 _PyUnicode_STATE(self).ready = 1;
14829 _PyUnicode_WSTR(self) = NULL;
14830 _PyUnicode_UTF8_LENGTH(self) = 0;
14831 _PyUnicode_UTF8(self) = NULL;
14832 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020014833 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014834
14835 share_utf8 = 0;
14836 share_wstr = 0;
14837 if (kind == PyUnicode_1BYTE_KIND) {
14838 char_size = 1;
14839 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
14840 share_utf8 = 1;
14841 }
14842 else if (kind == PyUnicode_2BYTE_KIND) {
14843 char_size = 2;
14844 if (sizeof(wchar_t) == 2)
14845 share_wstr = 1;
14846 }
14847 else {
14848 assert(kind == PyUnicode_4BYTE_KIND);
14849 char_size = 4;
14850 if (sizeof(wchar_t) == 4)
14851 share_wstr = 1;
14852 }
14853
14854 /* Ensure we won't overflow the length. */
14855 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
14856 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014857 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014858 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014859 data = PyObject_MALLOC((length + 1) * char_size);
14860 if (data == NULL) {
14861 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014862 goto onError;
14863 }
14864
Victor Stinnerc3c74152011-10-02 20:39:55 +020014865 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014866 if (share_utf8) {
14867 _PyUnicode_UTF8_LENGTH(self) = length;
14868 _PyUnicode_UTF8(self) = data;
14869 }
14870 if (share_wstr) {
14871 _PyUnicode_WSTR_LENGTH(self) = length;
14872 _PyUnicode_WSTR(self) = (wchar_t *)data;
14873 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014874
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014875 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020014876 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014877 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014878#ifdef Py_DEBUG
14879 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14880#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020014881 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010014882 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014883
14884onError:
14885 Py_DECREF(unicode);
14886 Py_DECREF(self);
14887 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000014888}
14889
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000014890PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070014891"str(object='') -> str\n\
14892str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000014893\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100014894Create a new string object from the given object. If encoding or\n\
14895errors is specified, then the object must expose a data buffer\n\
14896that will be decoded using the given encoding and error handler.\n\
14897Otherwise, returns the result of object.__str__() (if defined)\n\
14898or repr(object).\n\
14899encoding defaults to sys.getdefaultencoding().\n\
14900errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000014901
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014902static PyObject *unicode_iter(PyObject *seq);
14903
Guido van Rossumd57fd912000-03-10 22:53:23 +000014904PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000014905 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014906 "str", /* tp_name */
14907 sizeof(PyUnicodeObject), /* tp_size */
14908 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014909 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014910 (destructor)unicode_dealloc, /* tp_dealloc */
14911 0, /* tp_print */
14912 0, /* tp_getattr */
14913 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014914 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014915 unicode_repr, /* tp_repr */
14916 &unicode_as_number, /* tp_as_number */
14917 &unicode_as_sequence, /* tp_as_sequence */
14918 &unicode_as_mapping, /* tp_as_mapping */
14919 (hashfunc) unicode_hash, /* tp_hash*/
14920 0, /* tp_call*/
14921 (reprfunc) unicode_str, /* tp_str */
14922 PyObject_GenericGetAttr, /* tp_getattro */
14923 0, /* tp_setattro */
14924 0, /* tp_as_buffer */
14925 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000014926 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014927 unicode_doc, /* tp_doc */
14928 0, /* tp_traverse */
14929 0, /* tp_clear */
14930 PyUnicode_RichCompare, /* tp_richcompare */
14931 0, /* tp_weaklistoffset */
14932 unicode_iter, /* tp_iter */
14933 0, /* tp_iternext */
14934 unicode_methods, /* tp_methods */
14935 0, /* tp_members */
14936 0, /* tp_getset */
14937 &PyBaseObject_Type, /* tp_base */
14938 0, /* tp_dict */
14939 0, /* tp_descr_get */
14940 0, /* tp_descr_set */
14941 0, /* tp_dictoffset */
14942 0, /* tp_init */
14943 0, /* tp_alloc */
14944 unicode_new, /* tp_new */
14945 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014946};
14947
14948/* Initialize the Unicode implementation */
14949
Victor Stinner3a50e702011-10-18 21:21:00 +020014950int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014951{
Thomas Wouters477c8d52006-05-27 19:21:47 +000014952 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014953 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000014954 0x000A, /* LINE FEED */
14955 0x000D, /* CARRIAGE RETURN */
14956 0x001C, /* FILE SEPARATOR */
14957 0x001D, /* GROUP SEPARATOR */
14958 0x001E, /* RECORD SEPARATOR */
14959 0x0085, /* NEXT LINE */
14960 0x2028, /* LINE SEPARATOR */
14961 0x2029, /* PARAGRAPH SEPARATOR */
14962 };
14963
Fred Drakee4315f52000-05-09 19:53:39 +000014964 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020014965 _Py_INCREF_UNICODE_EMPTY();
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014966 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014967 Py_FatalError("Can't create empty string");
Serhiy Storchaka678db842013-01-26 12:16:36 +020014968 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014969
Guido van Rossumcacfc072002-05-24 19:01:59 +000014970 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014971 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000014972
14973 /* initialize the linebreak bloom filter */
14974 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014975 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020014976 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014977
Christian Heimes26532f72013-07-20 14:57:16 +020014978 if (PyType_Ready(&EncodingMapType) < 0)
14979 Py_FatalError("Can't initialize encoding map type");
Victor Stinner3a50e702011-10-18 21:21:00 +020014980
Benjamin Petersonc4311282012-10-30 23:21:10 -040014981 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
14982 Py_FatalError("Can't initialize field name iterator type");
14983
14984 if (PyType_Ready(&PyFormatterIter_Type) < 0)
14985 Py_FatalError("Can't initialize formatter iter type");
Benjamin Petersone8ea97f2012-10-30 23:27:52 -040014986
Victor Stinner3a50e702011-10-18 21:21:00 +020014987 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014988}
14989
14990/* Finalize the Unicode implementation */
14991
/* Does nothing and always returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int freed = 0;

    return freed;
}
14997
Guido van Rossumd57fd912000-03-10 22:53:23 +000014998void
Thomas Wouters78890102000-07-22 19:25:51 +000014999_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015000{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000015001 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015002
Serhiy Storchaka05997252013-01-26 12:14:02 +020015003 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +000015004
Serhiy Storchaka05997252013-01-26 12:14:02 +020015005 for (i = 0; i < 256; i++)
15006 Py_CLEAR(unicode_latin1[i]);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020015007 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000015008 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015009}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015010
Walter Dörwald16807132007-05-25 13:52:07 +000015011void
15012PyUnicode_InternInPlace(PyObject **p)
15013{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015014 PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015015 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015016#ifdef Py_DEBUG
15017 assert(s != NULL);
15018 assert(_PyUnicode_CHECK(s));
15019#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000015020 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020015021 return;
15022#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015023 /* If it's a subclass, we don't really know what putting
15024 it in the interned dict might do. */
15025 if (!PyUnicode_CheckExact(s))
15026 return;
15027 if (PyUnicode_CHECK_INTERNED(s))
15028 return;
15029 if (interned == NULL) {
15030 interned = PyDict_New();
15031 if (interned == NULL) {
15032 PyErr_Clear(); /* Don't leave an exception */
15033 return;
15034 }
15035 }
15036 /* It might be that the GetItem call fails even
15037 though the key is present in the dictionary,
15038 namely when this happens during a stack overflow. */
15039 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010015040 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015041 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000015042
Victor Stinnerf0335102013-04-14 19:13:03 +020015043 if (t) {
15044 Py_INCREF(t);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030015045 Py_SETREF(*p, t);
Victor Stinnerf0335102013-04-14 19:13:03 +020015046 return;
15047 }
Walter Dörwald16807132007-05-25 13:52:07 +000015048
Benjamin Peterson14339b62009-01-31 16:36:08 +000015049 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010015050 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015051 PyErr_Clear();
15052 PyThreadState_GET()->recursion_critical = 0;
15053 return;
15054 }
15055 PyThreadState_GET()->recursion_critical = 0;
15056 /* The two references in interned are not counted by refcnt.
15057 The deallocator will take care of this */
15058 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015059 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000015060}
15061
15062void
15063PyUnicode_InternImmortal(PyObject **p)
15064{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015065 PyUnicode_InternInPlace(p);
15066 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015067 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015068 Py_INCREF(*p);
15069 }
Walter Dörwald16807132007-05-25 13:52:07 +000015070}
15071
15072PyObject *
15073PyUnicode_InternFromString(const char *cp)
15074{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015075 PyObject *s = PyUnicode_FromString(cp);
15076 if (s == NULL)
15077 return NULL;
15078 PyUnicode_InternInPlace(&s);
15079 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015080}
15081
Alexander Belopolsky40018472011-02-26 01:02:56 +000015082void
15083_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000015084{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015085 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015086 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015087 Py_ssize_t i, n;
15088 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000015089
Benjamin Peterson14339b62009-01-31 16:36:08 +000015090 if (interned == NULL || !PyDict_Check(interned))
15091 return;
15092 keys = PyDict_Keys(interned);
15093 if (keys == NULL || !PyList_Check(keys)) {
15094 PyErr_Clear();
15095 return;
15096 }
Walter Dörwald16807132007-05-25 13:52:07 +000015097
Benjamin Peterson14339b62009-01-31 16:36:08 +000015098 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
15099 detector, interned unicode strings are not forcibly deallocated;
15100 rather, we give them their stolen references back, and then clear
15101 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000015102
Benjamin Peterson14339b62009-01-31 16:36:08 +000015103 n = PyList_GET_SIZE(keys);
15104 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000015105 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015106 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015107 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015108 if (PyUnicode_READY(s) == -1) {
15109 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015110 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015111 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015112 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015113 case SSTATE_NOT_INTERNED:
15114 /* XXX Shouldn't happen */
15115 break;
15116 case SSTATE_INTERNED_IMMORTAL:
15117 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015118 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015119 break;
15120 case SSTATE_INTERNED_MORTAL:
15121 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015122 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015123 break;
15124 default:
15125 Py_FatalError("Inconsistent interned string state.");
15126 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015127 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015128 }
15129 fprintf(stderr, "total size of all interned strings: "
15130 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
15131 "mortal/immortal\n", mortal_size, immortal_size);
15132 Py_DECREF(keys);
15133 PyDict_Clear(interned);
Serhiy Storchaka05997252013-01-26 12:14:02 +020015134 Py_CLEAR(interned);
Walter Dörwald16807132007-05-25 13:52:07 +000015135}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015136
15137
15138/********************* Unicode Iterator **************************/
15139
15140typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015141 PyObject_HEAD
15142 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015143 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015144} unicodeiterobject;
15145
15146static void
15147unicodeiter_dealloc(unicodeiterobject *it)
15148{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015149 _PyObject_GC_UNTRACK(it);
15150 Py_XDECREF(it->it_seq);
15151 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015152}
15153
15154static int
15155unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15156{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015157 Py_VISIT(it->it_seq);
15158 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015159}
15160
15161static PyObject *
15162unicodeiter_next(unicodeiterobject *it)
15163{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015164 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015165
Benjamin Peterson14339b62009-01-31 16:36:08 +000015166 assert(it != NULL);
15167 seq = it->it_seq;
15168 if (seq == NULL)
15169 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015170 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015171
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015172 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15173 int kind = PyUnicode_KIND(seq);
15174 void *data = PyUnicode_DATA(seq);
15175 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15176 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015177 if (item != NULL)
15178 ++it->it_index;
15179 return item;
15180 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015181
Benjamin Peterson14339b62009-01-31 16:36:08 +000015182 it->it_seq = NULL;
Serhiy Storchakafbb1c5e2016-03-30 20:40:02 +030015183 Py_DECREF(seq);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015184 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015185}
15186
15187static PyObject *
15188unicodeiter_len(unicodeiterobject *it)
15189{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015190 Py_ssize_t len = 0;
15191 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015192 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015193 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015194}
15195
15196PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15197
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015198static PyObject *
15199unicodeiter_reduce(unicodeiterobject *it)
15200{
15201 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020015202 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015203 it->it_seq, it->it_index);
15204 } else {
15205 PyObject *u = PyUnicode_FromUnicode(NULL, 0);
15206 if (u == NULL)
15207 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020015208 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015209 }
15210}
15211
15212PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15213
15214static PyObject *
15215unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15216{
15217 Py_ssize_t index = PyLong_AsSsize_t(state);
15218 if (index == -1 && PyErr_Occurred())
15219 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015220 if (it->it_seq != NULL) {
15221 if (index < 0)
15222 index = 0;
15223 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15224 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15225 it->it_index = index;
15226 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015227 Py_RETURN_NONE;
15228}
15229
15230PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15231
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015232static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015233 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015234 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015235 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15236 reduce_doc},
15237 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15238 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015239 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015240};
15241
15242PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015243 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15244 "str_iterator", /* tp_name */
15245 sizeof(unicodeiterobject), /* tp_basicsize */
15246 0, /* tp_itemsize */
15247 /* methods */
15248 (destructor)unicodeiter_dealloc, /* tp_dealloc */
15249 0, /* tp_print */
15250 0, /* tp_getattr */
15251 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000015252 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015253 0, /* tp_repr */
15254 0, /* tp_as_number */
15255 0, /* tp_as_sequence */
15256 0, /* tp_as_mapping */
15257 0, /* tp_hash */
15258 0, /* tp_call */
15259 0, /* tp_str */
15260 PyObject_GenericGetAttr, /* tp_getattro */
15261 0, /* tp_setattro */
15262 0, /* tp_as_buffer */
15263 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15264 0, /* tp_doc */
15265 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15266 0, /* tp_clear */
15267 0, /* tp_richcompare */
15268 0, /* tp_weaklistoffset */
15269 PyObject_SelfIter, /* tp_iter */
15270 (iternextfunc)unicodeiter_next, /* tp_iternext */
15271 unicodeiter_methods, /* tp_methods */
15272 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015273};
15274
15275static PyObject *
15276unicode_iter(PyObject *seq)
15277{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015278 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015279
Benjamin Peterson14339b62009-01-31 16:36:08 +000015280 if (!PyUnicode_Check(seq)) {
15281 PyErr_BadInternalCall();
15282 return NULL;
15283 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015284 if (PyUnicode_READY(seq) == -1)
15285 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015286 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15287 if (it == NULL)
15288 return NULL;
15289 it->it_index = 0;
15290 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015291 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015292 _PyObject_GC_TRACK(it);
15293 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015294}
15295
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015296
15297size_t
15298Py_UNICODE_strlen(const Py_UNICODE *u)
15299{
15300 int res = 0;
15301 while(*u++)
15302 res++;
15303 return res;
15304}
15305
15306Py_UNICODE*
15307Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15308{
15309 Py_UNICODE *u = s1;
15310 while ((*u++ = *s2++));
15311 return s1;
15312}
15313
15314Py_UNICODE*
15315Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15316{
15317 Py_UNICODE *u = s1;
15318 while ((*u++ = *s2++))
15319 if (n-- == 0)
15320 break;
15321 return s1;
15322}
15323
15324Py_UNICODE*
15325Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15326{
15327 Py_UNICODE *u1 = s1;
15328 u1 += Py_UNICODE_strlen(u1);
15329 Py_UNICODE_strcpy(u1, s2);
15330 return s1;
15331}
15332
15333int
15334Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15335{
15336 while (*s1 && *s2 && *s1 == *s2)
15337 s1++, s2++;
15338 if (*s1 && *s2)
15339 return (*s1 < *s2) ? -1 : +1;
15340 if (*s1)
15341 return 1;
15342 if (*s2)
15343 return -1;
15344 return 0;
15345}
15346
15347int
15348Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15349{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015350 Py_UNICODE u1, u2;
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015351 for (; n != 0; n--) {
15352 u1 = *s1;
15353 u2 = *s2;
15354 if (u1 != u2)
15355 return (u1 < u2) ? -1 : +1;
15356 if (u1 == '\0')
15357 return 0;
15358 s1++;
15359 s2++;
15360 }
15361 return 0;
15362}
15363
15364Py_UNICODE*
15365Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15366{
15367 const Py_UNICODE *p;
15368 for (p = s; *p; p++)
15369 if (*p == c)
15370 return (Py_UNICODE*)p;
15371 return NULL;
15372}
15373
15374Py_UNICODE*
15375Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15376{
15377 const Py_UNICODE *p;
15378 p = s + Py_UNICODE_strlen(s);
15379 while (p != s) {
15380 p--;
15381 if (*p == c)
15382 return (Py_UNICODE*)p;
15383 }
15384 return NULL;
15385}
Victor Stinner331ea922010-08-10 16:37:20 +000015386
Victor Stinner71133ff2010-09-01 23:43:53 +000015387Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015388PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015389{
Victor Stinner577db2c2011-10-11 22:12:48 +020015390 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015391 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015392
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015393 if (!PyUnicode_Check(unicode)) {
15394 PyErr_BadArgument();
15395 return NULL;
15396 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015397 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015398 if (u == NULL)
15399 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015400 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -070015401 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015402 PyErr_NoMemory();
15403 return NULL;
15404 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015405 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015406 size *= sizeof(Py_UNICODE);
15407 copy = PyMem_Malloc(size);
15408 if (copy == NULL) {
15409 PyErr_NoMemory();
15410 return NULL;
15411 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015412 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015413 return copy;
15414}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015415
Georg Brandl66c221e2010-10-14 07:04:07 +000015416/* A _string module, to export formatter_parser and formatter_field_name_split
15417 to the string.Formatter class implemented in Python. */
15418
15419static PyMethodDef _string_methods[] = {
15420 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15421 METH_O, PyDoc_STR("split the argument as a field name")},
15422 {"formatter_parser", (PyCFunction) formatter_parser,
15423 METH_O, PyDoc_STR("parse the argument as a format string")},
15424 {NULL, NULL}
15425};
15426
15427static struct PyModuleDef _string_module = {
15428 PyModuleDef_HEAD_INIT,
15429 "_string",
15430 PyDoc_STR("string helper module"),
15431 0,
15432 _string_methods,
15433 NULL,
15434 NULL,
15435 NULL,
15436 NULL
15437};
15438
15439PyMODINIT_FUNC
15440PyInit__string(void)
15441{
15442 return PyModule_Create(&_string_module);
15443}
15444
15445
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000015446#ifdef __cplusplus
15447}
15448#endif