blob: db6a51ca22ae0fd9b7d311ce9242cb7065da2c8f [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Raymond Hettingerac2ef652015-07-04 16:04:44 -070045#include "stringlib/eq.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000046
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000047#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000048#include <windows.h>
49#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000050
Larry Hastings61272b72014-01-07 12:41:53 -080051/*[clinic input]
Larry Hastingsc2047262014-01-25 20:43:29 -080052class str "PyUnicodeObject *" "&PyUnicode_Type"
Larry Hastings61272b72014-01-07 12:41:53 -080053[clinic start generated code]*/
Larry Hastings581ee362014-01-28 05:00:08 -080054/*[clinic end generated code: output=da39a3ee5e6b4b0d input=604e916854800fa8]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080055
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000056/* --- Globals ------------------------------------------------------------
57
Serhiy Storchaka05997252013-01-26 12:14:02 +020058NOTE: In the interpreter's initialization phase, some globals are currently
59 initialized dynamically as needed. In the process Unicode objects may
60 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000061
62*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000063
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000064
65#ifdef __cplusplus
66extern "C" {
67#endif
68
/* Largest valid Unicode code point: U+10FFFF (1,114,111). */
#define MAX_UNICODE 0x10FFFF
71
/* Type check used by the accessor macros in this file: a consistency check
   (without content inspection) in debug builds, a plain type check
   otherwise. */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif

/* Raw access to the utf8 field of a PyCompactUnicodeObject (no checks). */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)

/* UTF-8 pointer of a ready string.  For compact ASCII objects this is the
   memory located right after the PyASCIIObject header; otherwise it is the
   stored utf8 field. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))

/* Raw access to the utf8_length field (no checks). */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)

/* UTF-8 length of a ready string.  For compact ASCII objects this is the
   string length; otherwise it is the stored utf8_length field. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))

/* Unchecked field accessors; each expands to a member access. */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)

/* Checked accessors: assert _PyUnicode_CHECK(op) before reading the field. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)

/* Unchecked access to the data.any pointer of a PyUnicodeObject. */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200112
/* Local replacement for PyUnicode_READY that asserts _PyUnicode_CHECK(op)
   first.  Evaluates to 0 when the object is already ready, otherwise to the
   result of _PyUnicode_Ready(op). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                                     \
    (assert(_PyUnicode_CHECK(op)),                              \
     (PyUnicode_IS_READY(op) ? 0 : _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200119
/* True if the utf8 pointer is the same as the object's character data
   pointer.  Must not be used on compact ASCII objects (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the wstr pointer is the same as the object's character data
   pointer.  The macro only refers to its own argument, so it can be used
   with any expression, not just a variable named "unicode". */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
127
/* True if the Unicode object has its own allocated UTF-8 memory block:
   it is not compact ASCII, the utf8 pointer is set, and that pointer is
   not shared with the character data. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    ((!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* True if the Unicode object has its own allocated wstr memory block:
   the wstr pointer is set and either the object is not ready or the
   pointer is not shared with the character data. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    ((_PyUnicode_WSTR(op)                               \
      && (!PyUnicode_IS_READY(op)                       \
          || _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
141
/* Generic helper macro converting a run of characters from one character
   type to another.

   from_type, to_type: valid type names.
   begin, end:         pointers delimiting the source characters
                       ("from_type *"); end is exclusive.
   to:                 pointer ("to_type *") to the output buffer.

   Each argument is evaluated exactly once.  The bulk of the input is copied
   four elements per iteration; the remaining (at most three) elements are
   copied one at a time. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to)    \
    do {                                                                \
        to_type *_dst = (to_type *)(to);                                \
        const from_type *_src = (const from_type *)(begin);             \
        const from_type *_stop = (const from_type *)(end);              \
        Py_ssize_t _count = _stop - _src;                               \
        const from_type *_fast_stop =                                   \
            _src + _Py_SIZE_ROUND_DOWN(_count, 4);                      \
        for (; _src < _fast_stop; _src += 4, _dst += 4) {               \
            _dst[0] = (to_type) _src[0];                                \
            _dst[1] = (to_type) _src[1];                                \
            _dst[2] = (to_type) _src[2];                                \
            _dst[3] = (to_type) _src[3];                                \
        }                                                               \
        for (; _src < _stop; ++_src, ++_dst)                            \
            *_dst = (to_type) *_src;                                    \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200165
/* Platform-specific over-allocation factor (2 corresponds to 50%,
   4 corresponds to 25%). */
#ifndef MS_WINDOWS
   /* On Linux, over-allocating by 25% works best. */
#  define OVERALLOCATE_FACTOR 4
#else
   /* On Windows, over-allocating by 50% works best. */
#  define OVERALLOCATE_FACTOR 2
#endif
173
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0, the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200182static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000183
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000184/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200185static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200186
Serhiy Storchaka678db842013-01-26 12:16:36 +0200187#define _Py_INCREF_UNICODE_EMPTY() \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200188 do { \
189 if (unicode_empty != NULL) \
190 Py_INCREF(unicode_empty); \
191 else { \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200192 unicode_empty = PyUnicode_New(0, 0); \
193 if (unicode_empty != NULL) { \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200194 Py_INCREF(unicode_empty); \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200195 assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
196 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200197 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200198 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000199
Serhiy Storchaka678db842013-01-26 12:16:36 +0200200#define _Py_RETURN_UNICODE_EMPTY() \
201 do { \
202 _Py_INCREF_UNICODE_EMPTY(); \
203 return unicode_empty; \
204 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000205
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200206/* Forward declaration */
207Py_LOCAL_INLINE(int)
208_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
209
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200210/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200211static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200212
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000213/* Single character Unicode strings in the Latin-1 range are being
214 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200215static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000216
/* Fast detection of the most frequent whitespace characters: entry c is 1
   when the ASCII code c is one of

     0x09 CHARACTER TABULATION    0x1C FILE SEPARATOR
     0x0A LINE FEED               0x1D GROUP SEPARATOR
     0x0B LINE TABULATION         0x1E RECORD SEPARATOR
     0x0C FORM FEED               0x1F UNIT SEPARATOR
     0x0D CARRIAGE RETURN         0x20 SPACE

   and 0 otherwise.  The table has 128 entries, 16 per row below. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 */ 1, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
247
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200248/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200249static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200250static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100251static int unicode_modifiable(PyObject *unicode);
252
Victor Stinnerfe226c02011-10-03 03:52:20 +0200253
Alexander Belopolsky40018472011-02-26 01:02:56 +0000254static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100255_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200256static PyObject *
257_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
258static PyObject *
259_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
260
261static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000262unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000263 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100264 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000265 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
266
Alexander Belopolsky40018472011-02-26 01:02:56 +0000267static void
268raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300269 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100270 PyObject *unicode,
271 Py_ssize_t startpos, Py_ssize_t endpos,
272 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000273
/* Fast detection of ASCII line breaks: entry c is 1 when the ASCII code c
   is one of

     0x0A LINE FEED               0x1C FILE SEPARATOR
     0x0B LINE TABULATION         0x1D GROUP SEPARATOR
     0x0C FORM FEED               0x1E RECORD SEPARATOR
     0x0D CARRIAGE RETURN

   and 0 otherwise.  The table has 128 entries, 16 per row below. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
301
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300302#include "clinic/unicodeobject.c.h"
303
/* Identifiers for the codec error handlers recognized by name in
   get_error_handler().  _Py_ERROR_OTHER stands for any other name. */
typedef enum {
    _Py_ERROR_UNKNOWN=0,
    _Py_ERROR_STRICT,
    _Py_ERROR_SURROGATEESCAPE,
    _Py_ERROR_REPLACE,
    _Py_ERROR_IGNORE,
    _Py_ERROR_BACKSLASHREPLACE,
    _Py_ERROR_SURROGATEPASS,
    _Py_ERROR_XMLCHARREFREPLACE,
    _Py_ERROR_OTHER
} _Py_error_handler;

/* Map an error handler name to its _Py_error_handler value.

   errors: handler name, or NULL.

   Returns _Py_ERROR_STRICT for NULL or "strict", the matching constant for
   the other recognized names (exact, case-sensitive match), and
   _Py_ERROR_OTHER for anything else. */
static _Py_error_handler
get_error_handler(const char *errors)
{
    static const struct {
        const char *name;
        _Py_error_handler handler;
    } known_handlers[] = {
        {"strict",            _Py_ERROR_STRICT},
        {"surrogateescape",   _Py_ERROR_SURROGATEESCAPE},
        {"replace",           _Py_ERROR_REPLACE},
        {"ignore",            _Py_ERROR_IGNORE},
        {"backslashreplace",  _Py_ERROR_BACKSLASHREPLACE},
        {"surrogatepass",     _Py_ERROR_SURROGATEPASS},
        {"xmlcharrefreplace", _Py_ERROR_XMLCHARREFREPLACE}
    };
    size_t k;

    if (errors == NULL)
        return _Py_ERROR_STRICT;

    for (k = 0; k < sizeof(known_handlers) / sizeof(known_handlers[0]); k++) {
        if (strcmp(errors, known_handlers[k].name) == 0)
            return known_handlers[k].handler;
    }
    return _Py_ERROR_OTHER;
}
335
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300336/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
337 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000338Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000339PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000340{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000341#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000342 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000343#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000344 /* This is actually an illegal character, so it should
345 not be passed to unichr. */
346 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000347#endif
348}
349
#ifdef Py_DEBUG
/* Debug-only check of the internal invariants of a unicode object.

   op:            object to check; must be a unicode object.
   check_content: if non-zero (and the string is not in the wchar_t-only
                  representation), also scan the characters to verify that
                  the narrowest suitable kind is used and that the data is
                  followed by a null character.

   Every check is an assert(); the function itself always returns 1 so
   that it can be used inside an assert() expression. */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII string */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* compact, non-ASCII string: the character data follows the
               PyCompactUnicodeObject header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }
        else {
            /* non-compact string: the data pointer lives in the object */
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* not ready yet: only the wstr representation exists */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
            else {
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
                else {
                    assert (compact->utf8 != data);
                }
            }
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr is expected to alias the data exactly when the kind has
               the same width as wchar_t */
#if SIZEOF_WCHAR_T == 2
            const unsigned int wchar_kind = PyUnicode_2BYTE_KIND;
#else
            const unsigned int wchar_kind = PyUnicode_4BYTE_KIND;
#endif
            if (kind == wchar_kind) {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            }
            else {
                assert(ascii->wstr != data);
            }
        }

        if (compact->utf8 == NULL)
            assert(compact->utf8_length == 0);
        if (ascii->wstr == NULL)
            assert(compact->wstr_length == 0);
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        Py_ssize_t i;
        Py_UCS4 maxchar = 0;
        void *data;
        Py_UCS4 ch;

        data = PyUnicode_DATA(ascii);
        for (i = 0; i < ascii->length; i++) {
            ch = PyUnicode_READ(kind, data, i);
            if (maxchar < ch)
                maxchar = ch;
        }

        switch (kind) {
        case PyUnicode_1BYTE_KIND:
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else {
                assert(maxchar < 128);
            }
            break;
        case PyUnicode_2BYTE_KIND:
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
            break;
        default:
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
            break;
        }
        /* the character data must be null-terminated */
        assert(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;
}
#endif
465
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100466static PyObject*
467unicode_result_wchar(PyObject *unicode)
468{
469#ifndef Py_DEBUG
470 Py_ssize_t len;
471
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100472 len = _PyUnicode_WSTR_LENGTH(unicode);
473 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100474 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200475 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100476 }
477
478 if (len == 1) {
479 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100480 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100481 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
482 Py_DECREF(unicode);
483 return latin1_char;
484 }
485 }
486
487 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200488 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100489 return NULL;
490 }
491#else
Victor Stinneraa771272012-10-04 02:32:58 +0200492 assert(Py_REFCNT(unicode) == 1);
493
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100494 /* don't make the result ready in debug mode to ensure that the caller
495 makes the string ready before using it */
496 assert(_PyUnicode_CheckConsistency(unicode, 1));
497#endif
498 return unicode;
499}
500
501static PyObject*
502unicode_result_ready(PyObject *unicode)
503{
504 Py_ssize_t length;
505
506 length = PyUnicode_GET_LENGTH(unicode);
507 if (length == 0) {
508 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100509 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200510 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100511 }
512 return unicode_empty;
513 }
514
515 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200516 void *data = PyUnicode_DATA(unicode);
517 int kind = PyUnicode_KIND(unicode);
518 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100519 if (ch < 256) {
520 PyObject *latin1_char = unicode_latin1[ch];
521 if (latin1_char != NULL) {
522 if (unicode != latin1_char) {
523 Py_INCREF(latin1_char);
524 Py_DECREF(unicode);
525 }
526 return latin1_char;
527 }
528 else {
529 assert(_PyUnicode_CheckConsistency(unicode, 1));
530 Py_INCREF(unicode);
531 unicode_latin1[ch] = unicode;
532 return unicode;
533 }
534 }
535 }
536
537 assert(_PyUnicode_CheckConsistency(unicode, 1));
538 return unicode;
539}
540
541static PyObject*
542unicode_result(PyObject *unicode)
543{
544 assert(_PyUnicode_CHECK(unicode));
545 if (PyUnicode_IS_READY(unicode))
546 return unicode_result_ready(unicode);
547 else
548 return unicode_result_wchar(unicode);
549}
550
Victor Stinnerc4b49542011-12-11 22:44:26 +0100551static PyObject*
552unicode_result_unchanged(PyObject *unicode)
553{
554 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500555 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100556 return NULL;
557 Py_INCREF(unicode);
558 return unicode;
559 }
560 else
561 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100562 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100563}
564
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200565/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
566 ASCII, Latin1, UTF-8, etc. */
567static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200568backslashreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200569 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
570{
Victor Stinnerad771582015-10-09 12:38:53 +0200571 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200572 Py_UCS4 ch;
573 enum PyUnicode_Kind kind;
574 void *data;
575
576 assert(PyUnicode_IS_READY(unicode));
577 kind = PyUnicode_KIND(unicode);
578 data = PyUnicode_DATA(unicode);
579
580 size = 0;
581 /* determine replacement size */
582 for (i = collstart; i < collend; ++i) {
583 Py_ssize_t incr;
584
585 ch = PyUnicode_READ(kind, data, i);
586 if (ch < 0x100)
587 incr = 2+2;
588 else if (ch < 0x10000)
589 incr = 2+4;
590 else {
591 assert(ch <= MAX_UNICODE);
Victor Stinner3fa36ff2015-10-09 03:37:11 +0200592 incr = 2+8;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200593 }
594 if (size > PY_SSIZE_T_MAX - incr) {
595 PyErr_SetString(PyExc_OverflowError,
596 "encoded result is too long for a Python string");
597 return NULL;
598 }
599 size += incr;
600 }
601
Victor Stinnerad771582015-10-09 12:38:53 +0200602 str = _PyBytesWriter_Prepare(writer, str, size);
603 if (str == NULL)
604 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200605
606 /* generate replacement */
607 for (i = collstart; i < collend; ++i) {
608 ch = PyUnicode_READ(kind, data, i);
Victor Stinner797485e2015-10-09 03:17:30 +0200609 *str++ = '\\';
610 if (ch >= 0x00010000) {
611 *str++ = 'U';
612 *str++ = Py_hexdigits[(ch>>28)&0xf];
613 *str++ = Py_hexdigits[(ch>>24)&0xf];
614 *str++ = Py_hexdigits[(ch>>20)&0xf];
615 *str++ = Py_hexdigits[(ch>>16)&0xf];
616 *str++ = Py_hexdigits[(ch>>12)&0xf];
617 *str++ = Py_hexdigits[(ch>>8)&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200618 }
Victor Stinner797485e2015-10-09 03:17:30 +0200619 else if (ch >= 0x100) {
620 *str++ = 'u';
621 *str++ = Py_hexdigits[(ch>>12)&0xf];
622 *str++ = Py_hexdigits[(ch>>8)&0xf];
623 }
624 else
625 *str++ = 'x';
626 *str++ = Py_hexdigits[(ch>>4)&0xf];
627 *str++ = Py_hexdigits[ch&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200628 }
629 return str;
630}
631
632/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
633 ASCII, Latin1, UTF-8, etc. */
634static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200635xmlcharrefreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200636 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
637{
Victor Stinnerad771582015-10-09 12:38:53 +0200638 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200639 Py_UCS4 ch;
640 enum PyUnicode_Kind kind;
641 void *data;
642
643 assert(PyUnicode_IS_READY(unicode));
644 kind = PyUnicode_KIND(unicode);
645 data = PyUnicode_DATA(unicode);
646
647 size = 0;
648 /* determine replacement size */
649 for (i = collstart; i < collend; ++i) {
650 Py_ssize_t incr;
651
652 ch = PyUnicode_READ(kind, data, i);
653 if (ch < 10)
654 incr = 2+1+1;
655 else if (ch < 100)
656 incr = 2+2+1;
657 else if (ch < 1000)
658 incr = 2+3+1;
659 else if (ch < 10000)
660 incr = 2+4+1;
661 else if (ch < 100000)
662 incr = 2+5+1;
663 else if (ch < 1000000)
664 incr = 2+6+1;
665 else {
666 assert(ch <= MAX_UNICODE);
667 incr = 2+7+1;
668 }
669 if (size > PY_SSIZE_T_MAX - incr) {
670 PyErr_SetString(PyExc_OverflowError,
671 "encoded result is too long for a Python string");
672 return NULL;
673 }
674 size += incr;
675 }
676
Victor Stinnerad771582015-10-09 12:38:53 +0200677 str = _PyBytesWriter_Prepare(writer, str, size);
678 if (str == NULL)
679 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200680
681 /* generate replacement */
682 for (i = collstart; i < collend; ++i) {
683 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
684 }
685 return str;
686}
687
Thomas Wouters477c8d52006-05-27 19:21:47 +0000688/* --- Bloom Filters ----------------------------------------------------- */
689
/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the low
   log2(BLOOM_WIDTH) bits (5, 6 or 7) of each Unicode character as the
   bit index. */
693
694/* the linebreak mask is set up by Unicode_Init below */
695
Antoine Pitrouf068f942010-01-13 14:19:12 +0000696#if LONG_BIT >= 128
697#define BLOOM_WIDTH 128
698#elif LONG_BIT >= 64
699#define BLOOM_WIDTH 64
700#elif LONG_BIT >= 32
701#define BLOOM_WIDTH 32
702#else
703#error "LONG_BIT is smaller than 32"
704#endif
705
/* Integer type that holds one bloom mask (one bit per bucket). */
#define BLOOM_MASK unsigned long

/* Mask consulted by BLOOM_LINEBREAK().  It is initialised with every bit set,
   so with this initial value the filter never rejects a character. */
static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;

/* Non-zero if the bit selected by the low bits of ch is set in mask.
   Both arguments are parenthesised so that compound expressions such as
   BLOOM(a | b, ch) are tested as a whole. */
#define BLOOM(mask, ch)     (((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))

/* Line-break test: table lookup for ch < 128, otherwise the bloom filter is
   used as a cheap pre-check before calling Py_UNICODE_ISLINEBREAK(). */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000715
Alexander Belopolsky40018472011-02-26 01:02:56 +0000716Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200717make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000718{
Victor Stinnera85af502013-04-09 21:53:54 +0200719#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
720 do { \
721 TYPE *data = (TYPE *)PTR; \
722 TYPE *end = data + LEN; \
723 Py_UCS4 ch; \
724 for (; data != end; data++) { \
725 ch = *data; \
726 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
727 } \
728 break; \
729 } while (0)
730
Thomas Wouters477c8d52006-05-27 19:21:47 +0000731 /* calculate simple bloom-style bitmask for a given unicode string */
732
Antoine Pitrouf068f942010-01-13 14:19:12 +0000733 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000734
735 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200736 switch (kind) {
737 case PyUnicode_1BYTE_KIND:
738 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
739 break;
740 case PyUnicode_2BYTE_KIND:
741 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
742 break;
743 case PyUnicode_4BYTE_KIND:
744 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
745 break;
746 default:
747 assert(0);
748 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000749 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200750
751#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000752}
753
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300754static int
755ensure_unicode(PyObject *obj)
756{
757 if (!PyUnicode_Check(obj)) {
758 PyErr_Format(PyExc_TypeError,
759 "must be str, not %.100s",
760 Py_TYPE(obj)->tp_name);
761 return -1;
762 }
763 return PyUnicode_READY(obj);
764}
765
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200766/* Compilation of templated routines */
767
768#include "stringlib/asciilib.h"
769#include "stringlib/fastsearch.h"
770#include "stringlib/partition.h"
771#include "stringlib/split.h"
772#include "stringlib/count.h"
773#include "stringlib/find.h"
774#include "stringlib/find_max_char.h"
775#include "stringlib/localeutil.h"
776#include "stringlib/undef.h"
777
778#include "stringlib/ucs1lib.h"
779#include "stringlib/fastsearch.h"
780#include "stringlib/partition.h"
781#include "stringlib/split.h"
782#include "stringlib/count.h"
783#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300784#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200785#include "stringlib/find_max_char.h"
786#include "stringlib/localeutil.h"
787#include "stringlib/undef.h"
788
789#include "stringlib/ucs2lib.h"
790#include "stringlib/fastsearch.h"
791#include "stringlib/partition.h"
792#include "stringlib/split.h"
793#include "stringlib/count.h"
794#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300795#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200796#include "stringlib/find_max_char.h"
797#include "stringlib/localeutil.h"
798#include "stringlib/undef.h"
799
800#include "stringlib/ucs4lib.h"
801#include "stringlib/fastsearch.h"
802#include "stringlib/partition.h"
803#include "stringlib/split.h"
804#include "stringlib/count.h"
805#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300806#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200807#include "stringlib/find_max_char.h"
808#include "stringlib/localeutil.h"
809#include "stringlib/undef.h"
810
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200811#include "stringlib/unicodedefs.h"
812#include "stringlib/fastsearch.h"
813#include "stringlib/count.h"
814#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100815#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200816
Guido van Rossumd57fd912000-03-10 22:53:23 +0000817/* --- Unicode Object ----------------------------------------------------- */
818
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200819static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200820fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200821
Serhiy Storchakad9d769f2015-03-24 21:55:47 +0200822Py_LOCAL_INLINE(Py_ssize_t) findchar(const void *s, int kind,
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200823 Py_ssize_t size, Py_UCS4 ch,
824 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200825{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200826 switch (kind) {
827 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200828 if ((Py_UCS1) ch != ch)
829 return -1;
830 if (direction > 0)
831 return ucs1lib_find_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
832 else
833 return ucs1lib_rfind_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200834 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200835 if ((Py_UCS2) ch != ch)
836 return -1;
837 if (direction > 0)
838 return ucs2lib_find_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
839 else
840 return ucs2lib_rfind_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200841 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200842 if (direction > 0)
843 return ucs4lib_find_char((Py_UCS4 *) s, size, ch);
844 else
845 return ucs4lib_rfind_char((Py_UCS4 *) s, size, ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200846 default:
847 assert(0);
848 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200849 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200850}
851
#ifdef Py_DEBUG
/* Overwrite the characters [old_length, length) of a Unicode string with
   0xff bytes so that code reading them before they are initialised is
   detected earlier.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0.

   Nothing is written when the current length is not greater than
   old_length. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    int char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        /* the kind value is used as the size of one character in bytes */
        memset(bytes + old_length * char_size, 0xff,
               (new_length - old_length) * char_size);
    }
}
#endif
870
Victor Stinnerfe226c02011-10-03 03:52:20 +0200871static PyObject*
872resize_compact(PyObject *unicode, Py_ssize_t length)
873{
874 Py_ssize_t char_size;
875 Py_ssize_t struct_size;
876 Py_ssize_t new_size;
877 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100878 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +0200879#ifdef Py_DEBUG
880 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
881#endif
882
Victor Stinner79891572012-05-03 13:43:07 +0200883 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200884 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100885 assert(PyUnicode_IS_COMPACT(unicode));
886
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200887 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100888 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200889 struct_size = sizeof(PyASCIIObject);
890 else
891 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200892 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200893
Victor Stinnerfe226c02011-10-03 03:52:20 +0200894 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
895 PyErr_NoMemory();
896 return NULL;
897 }
898 new_size = (struct_size + (length + 1) * char_size);
899
Serhiy Storchaka7aa69082015-12-03 01:02:03 +0200900 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
901 PyObject_DEL(_PyUnicode_UTF8(unicode));
902 _PyUnicode_UTF8(unicode) = NULL;
903 _PyUnicode_UTF8_LENGTH(unicode) = 0;
904 }
Victor Stinner84def372011-12-11 20:04:56 +0100905 _Py_DEC_REFTOTAL;
906 _Py_ForgetReference(unicode);
907
Serhiy Storchaka20b39b22014-09-28 11:27:24 +0300908 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +0100909 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100910 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200911 PyErr_NoMemory();
912 return NULL;
913 }
Victor Stinner84def372011-12-11 20:04:56 +0100914 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200915 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100916
Victor Stinnerfe226c02011-10-03 03:52:20 +0200917 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200918 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200919 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100920 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200921 _PyUnicode_WSTR_LENGTH(unicode) = length;
922 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100923 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
924 PyObject_DEL(_PyUnicode_WSTR(unicode));
925 _PyUnicode_WSTR(unicode) = NULL;
Victor Stinner5bc03a62016-01-27 16:56:53 +0100926 if (!PyUnicode_IS_ASCII(unicode))
927 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100928 }
Victor Stinnerafffce42012-10-03 23:03:17 +0200929#ifdef Py_DEBUG
930 unicode_fill_invalid(unicode, old_length);
931#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200932 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
933 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200934 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200935 return unicode;
936}
937
Alexander Belopolsky40018472011-02-26 01:02:56 +0000938static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200939resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000940{
Victor Stinner95663112011-10-04 01:03:50 +0200941 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100942 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200943 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200944 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000945
Victor Stinnerfe226c02011-10-03 03:52:20 +0200946 if (PyUnicode_IS_READY(unicode)) {
947 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200948 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200949 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +0200950#ifdef Py_DEBUG
951 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
952#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200953
954 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200955 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200956 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
957 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200958
959 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
960 PyErr_NoMemory();
961 return -1;
962 }
963 new_size = (length + 1) * char_size;
964
Victor Stinner7a9105a2011-12-12 00:13:42 +0100965 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
966 {
967 PyObject_DEL(_PyUnicode_UTF8(unicode));
968 _PyUnicode_UTF8(unicode) = NULL;
969 _PyUnicode_UTF8_LENGTH(unicode) = 0;
970 }
971
Victor Stinnerfe226c02011-10-03 03:52:20 +0200972 data = (PyObject *)PyObject_REALLOC(data, new_size);
973 if (data == NULL) {
974 PyErr_NoMemory();
975 return -1;
976 }
977 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200978 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200979 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200980 _PyUnicode_WSTR_LENGTH(unicode) = length;
981 }
982 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200983 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200984 _PyUnicode_UTF8_LENGTH(unicode) = length;
985 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200986 _PyUnicode_LENGTH(unicode) = length;
987 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +0200988#ifdef Py_DEBUG
989 unicode_fill_invalid(unicode, old_length);
990#endif
Victor Stinner95663112011-10-04 01:03:50 +0200991 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200992 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200993 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200994 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200995 }
Victor Stinner95663112011-10-04 01:03:50 +0200996 assert(_PyUnicode_WSTR(unicode) != NULL);
997
998 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -0700999 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +02001000 PyErr_NoMemory();
1001 return -1;
1002 }
Victor Stinner7a9105a2011-12-12 00:13:42 +01001003 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +02001004 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +01001005 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +02001006 if (!wstr) {
1007 PyErr_NoMemory();
1008 return -1;
1009 }
1010 _PyUnicode_WSTR(unicode) = wstr;
1011 _PyUnicode_WSTR(unicode)[length] = 0;
1012 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001013 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001014 return 0;
1015}
1016
Victor Stinnerfe226c02011-10-03 03:52:20 +02001017static PyObject*
1018resize_copy(PyObject *unicode, Py_ssize_t length)
1019{
1020 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001021 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001022 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001023
Benjamin Petersonbac79492012-01-14 13:34:47 -05001024 if (PyUnicode_READY(unicode) == -1)
Victor Stinner7a9105a2011-12-12 00:13:42 +01001025 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001026
1027 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1028 if (copy == NULL)
1029 return NULL;
1030
1031 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +02001032 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001033 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +02001034 }
1035 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001036 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001037
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001038 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001039 if (w == NULL)
1040 return NULL;
1041 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1042 copy_length = Py_MIN(copy_length, length);
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +02001043 Py_MEMCPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
1044 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001045 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001046 }
1047}
1048
Guido van Rossumd57fd912000-03-10 22:53:23 +00001049/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +00001050 Ux0000 terminated; some code (e.g. new_identifier)
1051 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001052
1053 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +00001054 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001055
1056*/
1057
Alexander Belopolsky40018472011-02-26 01:02:56 +00001058static PyUnicodeObject *
1059_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001060{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001061 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001062 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001063
Thomas Wouters477c8d52006-05-27 19:21:47 +00001064 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001065 if (length == 0 && unicode_empty != NULL) {
1066 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001067 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001068 }
1069
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001070 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001071 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001072 return (PyUnicodeObject *)PyErr_NoMemory();
1073 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001074 if (length < 0) {
1075 PyErr_SetString(PyExc_SystemError,
1076 "Negative size passed to _PyUnicode_New");
1077 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001078 }
1079
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001080 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1081 if (unicode == NULL)
1082 return NULL;
1083 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +01001084
1085 _PyUnicode_WSTR_LENGTH(unicode) = length;
1086 _PyUnicode_HASH(unicode) = -1;
1087 _PyUnicode_STATE(unicode).interned = 0;
1088 _PyUnicode_STATE(unicode).kind = 0;
1089 _PyUnicode_STATE(unicode).compact = 0;
1090 _PyUnicode_STATE(unicode).ready = 0;
1091 _PyUnicode_STATE(unicode).ascii = 0;
1092 _PyUnicode_DATA_ANY(unicode) = NULL;
1093 _PyUnicode_LENGTH(unicode) = 0;
1094 _PyUnicode_UTF8(unicode) = NULL;
1095 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1096
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001097 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1098 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001099 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00001100 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001101 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00001102 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001103
Jeremy Hyltond8082792003-09-16 19:41:39 +00001104 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +00001105 * the caller fails before initializing str -- unicode_resize()
1106 * reads str[0], and the Keep-Alive optimization can keep memory
1107 * allocated for str alive across a call to unicode_dealloc(unicode).
1108 * We don't want unicode_resize to read uninitialized memory in
1109 * that case.
1110 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001111 _PyUnicode_WSTR(unicode)[0] = 0;
1112 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +01001113
Victor Stinner7931d9a2011-11-04 00:22:48 +01001114 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001115 return unicode;
1116}
1117
Victor Stinnerf42dc442011-10-02 23:33:16 +02001118static const char*
1119unicode_kind_name(PyObject *unicode)
1120{
Victor Stinner42dfd712011-10-03 14:41:45 +02001121 /* don't check consistency: unicode_kind_name() is called from
1122 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +02001123 if (!PyUnicode_IS_COMPACT(unicode))
1124 {
1125 if (!PyUnicode_IS_READY(unicode))
1126 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -06001127 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001128 {
1129 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001130 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001131 return "legacy ascii";
1132 else
1133 return "legacy latin1";
1134 case PyUnicode_2BYTE_KIND:
1135 return "legacy UCS2";
1136 case PyUnicode_4BYTE_KIND:
1137 return "legacy UCS4";
1138 default:
1139 return "<legacy invalid kind>";
1140 }
1141 }
1142 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -06001143 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001144 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001145 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001146 return "ascii";
1147 else
Victor Stinnera3b334d2011-10-03 13:53:37 +02001148 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001149 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001150 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001151 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001152 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001153 default:
1154 return "<invalid compact kind>";
1155 }
1156}
1157
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001158#ifdef Py_DEBUG
/* Functions wrapping macros for use in debugger */

/* Return the value of the PyUnicode_UTF8() macro for unicode. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
1163
/* Debugger helper: return the value of the _PyUnicode_COMPACT_DATA() macro
   for unicode. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *compact_data = _PyUnicode_COMPACT_DATA(unicode);
    return compact_data;
}
1167void *_PyUnicode_data(void *unicode){
1168 printf("obj %p\n", unicode);
1169 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1170 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1171 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1172 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1173 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1174 return PyUnicode_DATA(unicode);
1175}
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001176
1177void
1178_PyUnicode_Dump(PyObject *op)
1179{
1180 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001181 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1182 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1183 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001184
Victor Stinnera849a4b2011-10-03 12:12:11 +02001185 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001186 {
1187 if (ascii->state.ascii)
1188 data = (ascii + 1);
1189 else
1190 data = (compact + 1);
1191 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001192 else
1193 data = unicode->data.any;
Victor Stinner293f3f52014-07-01 08:57:10 +02001194 printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1195 unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001196
Victor Stinnera849a4b2011-10-03 12:12:11 +02001197 if (ascii->wstr == data)
1198 printf("shared ");
1199 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001200
Victor Stinnera3b334d2011-10-03 13:53:37 +02001201 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinner293f3f52014-07-01 08:57:10 +02001202 printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
Victor Stinnera849a4b2011-10-03 12:12:11 +02001203 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1204 printf("shared ");
Victor Stinner293f3f52014-07-01 08:57:10 +02001205 printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
1206 compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001207 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001208 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001209}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001210#endif
1211
1212PyObject *
1213PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1214{
1215 PyObject *obj;
1216 PyCompactUnicodeObject *unicode;
1217 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001218 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001219 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001220 Py_ssize_t char_size;
1221 Py_ssize_t struct_size;
1222
1223 /* Optimization for empty strings */
1224 if (size == 0 && unicode_empty != NULL) {
1225 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001226 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001227 }
1228
Victor Stinner9e9d6892011-10-04 01:02:02 +02001229 is_ascii = 0;
1230 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001231 struct_size = sizeof(PyCompactUnicodeObject);
1232 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001233 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001234 char_size = 1;
1235 is_ascii = 1;
1236 struct_size = sizeof(PyASCIIObject);
1237 }
1238 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001239 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001240 char_size = 1;
1241 }
1242 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001243 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001244 char_size = 2;
1245 if (sizeof(wchar_t) == 2)
1246 is_sharing = 1;
1247 }
1248 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001249 if (maxchar > MAX_UNICODE) {
1250 PyErr_SetString(PyExc_SystemError,
1251 "invalid maximum character passed to PyUnicode_New");
1252 return NULL;
1253 }
Victor Stinner8f825062012-04-27 13:55:39 +02001254 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001255 char_size = 4;
1256 if (sizeof(wchar_t) == 4)
1257 is_sharing = 1;
1258 }
1259
1260 /* Ensure we won't overflow the size. */
1261 if (size < 0) {
1262 PyErr_SetString(PyExc_SystemError,
1263 "Negative size passed to PyUnicode_New");
1264 return NULL;
1265 }
1266 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1267 return PyErr_NoMemory();
1268
1269 /* Duplicated allocation code from _PyObject_New() instead of a call to
1270 * PyObject_New() so we are able to allocate space for the object and
1271 * it's data buffer.
1272 */
1273 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1274 if (obj == NULL)
1275 return PyErr_NoMemory();
1276 obj = PyObject_INIT(obj, &PyUnicode_Type);
1277 if (obj == NULL)
1278 return NULL;
1279
1280 unicode = (PyCompactUnicodeObject *)obj;
1281 if (is_ascii)
1282 data = ((PyASCIIObject*)obj) + 1;
1283 else
1284 data = unicode + 1;
1285 _PyUnicode_LENGTH(unicode) = size;
1286 _PyUnicode_HASH(unicode) = -1;
1287 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001288 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001289 _PyUnicode_STATE(unicode).compact = 1;
1290 _PyUnicode_STATE(unicode).ready = 1;
1291 _PyUnicode_STATE(unicode).ascii = is_ascii;
1292 if (is_ascii) {
1293 ((char*)data)[size] = 0;
1294 _PyUnicode_WSTR(unicode) = NULL;
1295 }
Victor Stinner8f825062012-04-27 13:55:39 +02001296 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001297 ((char*)data)[size] = 0;
1298 _PyUnicode_WSTR(unicode) = NULL;
1299 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001300 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001301 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001302 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001303 else {
1304 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001305 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001306 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001307 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001308 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001309 ((Py_UCS4*)data)[size] = 0;
1310 if (is_sharing) {
1311 _PyUnicode_WSTR_LENGTH(unicode) = size;
1312 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1313 }
1314 else {
1315 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1316 _PyUnicode_WSTR(unicode) = NULL;
1317 }
1318 }
Victor Stinner8f825062012-04-27 13:55:39 +02001319#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001320 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001321#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001322 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001323 return obj;
1324}
1325
#if SIZEOF_WCHAR_T == 2
/* Helper function to convert a 16-bit wchar_t representation to UCS4.  A
   high surrogate immediately followed by a low surrogate is joined into a
   single code point; every other unit is copied unchanged.  The other
   conversions are implemented as macros for efficiency.

   The characters of [begin, end) are written to the 4-byte data of unicode,
   which must be a 4-byte kind string whose length equals the number of code
   points produced (checked by assertions only).

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        int is_pair;

        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        is_pair = (Py_UNICODE_IS_HIGH_SURROGATE(src[0])
                   && (src+1) < end
                   && Py_UNICODE_IS_LOW_SURROGATE(src[1]));
        if (is_pair) {
            *dst++ = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
            src += 2;
        }
        else {
            *dst++ = *src++;
        }
    }
    /* the output must be filled exactly */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1365
Victor Stinnercd9950f2011-10-02 00:34:53 +02001366static int
Victor Stinner488fa492011-12-12 00:01:39 +01001367unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001368{
Victor Stinner488fa492011-12-12 00:01:39 +01001369 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001370 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001371 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001372 return -1;
1373 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001374 return 0;
1375}
1376
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001377static int
1378_copy_characters(PyObject *to, Py_ssize_t to_start,
1379 PyObject *from, Py_ssize_t from_start,
1380 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001381{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001382 unsigned int from_kind, to_kind;
1383 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001384
Victor Stinneree4544c2012-05-09 22:24:08 +02001385 assert(0 <= how_many);
1386 assert(0 <= from_start);
1387 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001388 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001389 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001390 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001391
Victor Stinnerd3f08822012-05-29 12:57:52 +02001392 assert(PyUnicode_Check(to));
1393 assert(PyUnicode_IS_READY(to));
1394 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1395
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001396 if (how_many == 0)
1397 return 0;
1398
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001399 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001400 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001401 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001402 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001403
Victor Stinnerf1852262012-06-16 16:38:26 +02001404#ifdef Py_DEBUG
1405 if (!check_maxchar
1406 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1407 {
1408 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1409 Py_UCS4 ch;
1410 Py_ssize_t i;
1411 for (i=0; i < how_many; i++) {
1412 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1413 assert(ch <= to_maxchar);
1414 }
1415 }
1416#endif
1417
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001418 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001419 if (check_maxchar
1420 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1421 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001422 /* Writing Latin-1 characters into an ASCII string requires to
1423 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001424 Py_UCS4 max_char;
1425 max_char = ucs1lib_find_max_char(from_data,
1426 (Py_UCS1*)from_data + how_many);
1427 if (max_char >= 128)
1428 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001429 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001430 Py_MEMCPY((char*)to_data + to_kind * to_start,
1431 (char*)from_data + from_kind * from_start,
1432 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001433 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001434 else if (from_kind == PyUnicode_1BYTE_KIND
1435 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001436 {
1437 _PyUnicode_CONVERT_BYTES(
1438 Py_UCS1, Py_UCS2,
1439 PyUnicode_1BYTE_DATA(from) + from_start,
1440 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1441 PyUnicode_2BYTE_DATA(to) + to_start
1442 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001443 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001444 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001445 && to_kind == PyUnicode_4BYTE_KIND)
1446 {
1447 _PyUnicode_CONVERT_BYTES(
1448 Py_UCS1, Py_UCS4,
1449 PyUnicode_1BYTE_DATA(from) + from_start,
1450 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1451 PyUnicode_4BYTE_DATA(to) + to_start
1452 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001453 }
1454 else if (from_kind == PyUnicode_2BYTE_KIND
1455 && to_kind == PyUnicode_4BYTE_KIND)
1456 {
1457 _PyUnicode_CONVERT_BYTES(
1458 Py_UCS2, Py_UCS4,
1459 PyUnicode_2BYTE_DATA(from) + from_start,
1460 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1461 PyUnicode_4BYTE_DATA(to) + to_start
1462 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001463 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001464 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001465 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1466
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001467 if (!check_maxchar) {
1468 if (from_kind == PyUnicode_2BYTE_KIND
1469 && to_kind == PyUnicode_1BYTE_KIND)
1470 {
1471 _PyUnicode_CONVERT_BYTES(
1472 Py_UCS2, Py_UCS1,
1473 PyUnicode_2BYTE_DATA(from) + from_start,
1474 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1475 PyUnicode_1BYTE_DATA(to) + to_start
1476 );
1477 }
1478 else if (from_kind == PyUnicode_4BYTE_KIND
1479 && to_kind == PyUnicode_1BYTE_KIND)
1480 {
1481 _PyUnicode_CONVERT_BYTES(
1482 Py_UCS4, Py_UCS1,
1483 PyUnicode_4BYTE_DATA(from) + from_start,
1484 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1485 PyUnicode_1BYTE_DATA(to) + to_start
1486 );
1487 }
1488 else if (from_kind == PyUnicode_4BYTE_KIND
1489 && to_kind == PyUnicode_2BYTE_KIND)
1490 {
1491 _PyUnicode_CONVERT_BYTES(
1492 Py_UCS4, Py_UCS2,
1493 PyUnicode_4BYTE_DATA(from) + from_start,
1494 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1495 PyUnicode_2BYTE_DATA(to) + to_start
1496 );
1497 }
1498 else {
1499 assert(0);
1500 return -1;
1501 }
1502 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001503 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001504 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001505 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001506 Py_ssize_t i;
1507
Victor Stinnera0702ab2011-09-29 14:14:38 +02001508 for (i=0; i < how_many; i++) {
1509 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001510 if (ch > to_maxchar)
1511 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001512 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1513 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001514 }
1515 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001516 return 0;
1517}
1518
Victor Stinnerd3f08822012-05-29 12:57:52 +02001519void
1520_PyUnicode_FastCopyCharacters(
1521 PyObject *to, Py_ssize_t to_start,
1522 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001523{
1524 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1525}
1526
1527Py_ssize_t
1528PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1529 PyObject *from, Py_ssize_t from_start,
1530 Py_ssize_t how_many)
1531{
1532 int err;
1533
1534 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1535 PyErr_BadInternalCall();
1536 return -1;
1537 }
1538
Benjamin Petersonbac79492012-01-14 13:34:47 -05001539 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001540 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001541 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001542 return -1;
1543
Victor Stinnerd3f08822012-05-29 12:57:52 +02001544 if (from_start < 0) {
1545 PyErr_SetString(PyExc_IndexError, "string index out of range");
1546 return -1;
1547 }
1548 if (to_start < 0) {
1549 PyErr_SetString(PyExc_IndexError, "string index out of range");
1550 return -1;
1551 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001552 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1553 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1554 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001555 "Cannot write %zi characters at %zi "
1556 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001557 how_many, to_start, PyUnicode_GET_LENGTH(to));
1558 return -1;
1559 }
1560
1561 if (how_many == 0)
1562 return 0;
1563
Victor Stinner488fa492011-12-12 00:01:39 +01001564 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001565 return -1;
1566
1567 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1568 if (err) {
1569 PyErr_Format(PyExc_SystemError,
1570 "Cannot copy %s characters "
1571 "into a string of %s characters",
1572 unicode_kind_name(from),
1573 unicode_kind_name(to));
1574 return -1;
1575 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001576 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001577}
1578
Victor Stinner17222162011-09-28 22:15:37 +02001579/* Find the maximum code point and count the number of surrogate pairs so a
1580 correct string length can be computed before converting a string to UCS4.
1581 This function counts single surrogates as a character and not as a pair.
1582
1583 Return 0 on success, or -1 on error. */
1584static int
1585find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1586 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001587{
1588 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001589 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001590
Victor Stinnerc53be962011-10-02 21:33:54 +02001591 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001592 *num_surrogates = 0;
1593 *maxchar = 0;
1594
1595 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001596#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001597 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1598 && (iter+1) < end
1599 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1600 {
1601 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1602 ++(*num_surrogates);
1603 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001604 }
1605 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001606#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001607 {
1608 ch = *iter;
1609 iter++;
1610 }
1611 if (ch > *maxchar) {
1612 *maxchar = ch;
1613 if (*maxchar > MAX_UNICODE) {
1614 PyErr_Format(PyExc_ValueError,
1615 "character U+%x is not in range [U+0000; U+10ffff]",
1616 ch);
1617 return -1;
1618 }
1619 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001620 }
1621 return 0;
1622}
1623
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001624int
1625_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001626{
1627 wchar_t *end;
1628 Py_UCS4 maxchar = 0;
1629 Py_ssize_t num_surrogates;
1630#if SIZEOF_WCHAR_T == 2
1631 Py_ssize_t length_wo_surrogates;
1632#endif
1633
Georg Brandl7597add2011-10-05 16:36:47 +02001634 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001635 strings were created using _PyObject_New() and where no canonical
1636 representation (the str field) has been set yet aka strings
1637 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001638 assert(_PyUnicode_CHECK(unicode));
1639 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001640 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001641 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001642 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001643 /* Actually, it should neither be interned nor be anything else: */
1644 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001645
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001646 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001647 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001648 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001649 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001650
1651 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001652 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1653 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001654 PyErr_NoMemory();
1655 return -1;
1656 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001657 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001658 _PyUnicode_WSTR(unicode), end,
1659 PyUnicode_1BYTE_DATA(unicode));
1660 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1661 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1662 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1663 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001664 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001665 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001666 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001667 }
1668 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001669 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001670 _PyUnicode_UTF8(unicode) = NULL;
1671 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001672 }
1673 PyObject_FREE(_PyUnicode_WSTR(unicode));
1674 _PyUnicode_WSTR(unicode) = NULL;
1675 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1676 }
1677 /* In this case we might have to convert down from 4-byte native
1678 wchar_t to 2-byte unicode. */
1679 else if (maxchar < 65536) {
1680 assert(num_surrogates == 0 &&
1681 "FindMaxCharAndNumSurrogatePairs() messed up");
1682
Victor Stinner506f5922011-09-28 22:34:18 +02001683#if SIZEOF_WCHAR_T == 2
1684 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001685 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001686 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1687 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1688 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001689 _PyUnicode_UTF8(unicode) = NULL;
1690 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001691#else
1692 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001693 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001694 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001695 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001696 PyErr_NoMemory();
1697 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001698 }
Victor Stinner506f5922011-09-28 22:34:18 +02001699 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1700 _PyUnicode_WSTR(unicode), end,
1701 PyUnicode_2BYTE_DATA(unicode));
1702 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1703 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1704 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001705 _PyUnicode_UTF8(unicode) = NULL;
1706 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001707 PyObject_FREE(_PyUnicode_WSTR(unicode));
1708 _PyUnicode_WSTR(unicode) = NULL;
1709 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1710#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001711 }
1712 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1713 else {
1714#if SIZEOF_WCHAR_T == 2
1715 /* in case the native representation is 2-bytes, we need to allocate a
1716 new normalized 4-byte version. */
1717 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001718 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1719 PyErr_NoMemory();
1720 return -1;
1721 }
Victor Stinnerc3c74152011-10-02 20:39:55 +02001722 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1723 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001724 PyErr_NoMemory();
1725 return -1;
1726 }
1727 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1728 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001729 _PyUnicode_UTF8(unicode) = NULL;
1730 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001731 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1732 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001733 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001734 PyObject_FREE(_PyUnicode_WSTR(unicode));
1735 _PyUnicode_WSTR(unicode) = NULL;
1736 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1737#else
1738 assert(num_surrogates == 0);
1739
Victor Stinnerc3c74152011-10-02 20:39:55 +02001740 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001741 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001742 _PyUnicode_UTF8(unicode) = NULL;
1743 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001744 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1745#endif
1746 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1747 }
1748 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001749 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001750 return 0;
1751}
1752
Alexander Belopolsky40018472011-02-26 01:02:56 +00001753static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001754unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001755{
Walter Dörwald16807132007-05-25 13:52:07 +00001756 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001757 case SSTATE_NOT_INTERNED:
1758 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001759
Benjamin Peterson29060642009-01-31 22:14:21 +00001760 case SSTATE_INTERNED_MORTAL:
1761 /* revive dead object temporarily for DelItem */
1762 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001763 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001764 Py_FatalError(
1765 "deletion of interned string failed");
1766 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001767
Benjamin Peterson29060642009-01-31 22:14:21 +00001768 case SSTATE_INTERNED_IMMORTAL:
1769 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001770
Benjamin Peterson29060642009-01-31 22:14:21 +00001771 default:
1772 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001773 }
1774
Victor Stinner03490912011-10-03 23:45:12 +02001775 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001776 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001777 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001778 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001779 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1780 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001781
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001782 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001783}
1784
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001785#ifdef Py_DEBUG
1786static int
1787unicode_is_singleton(PyObject *unicode)
1788{
1789 PyASCIIObject *ascii = (PyASCIIObject *)unicode;
1790 if (unicode == unicode_empty)
1791 return 1;
1792 if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1)
1793 {
1794 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
1795 if (ch < 256 && unicode_latin1[ch] == unicode)
1796 return 1;
1797 }
1798 return 0;
1799}
1800#endif
1801
Alexander Belopolsky40018472011-02-26 01:02:56 +00001802static int
Victor Stinner488fa492011-12-12 00:01:39 +01001803unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001804{
Victor Stinner488fa492011-12-12 00:01:39 +01001805 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001806 if (Py_REFCNT(unicode) != 1)
1807 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001808 if (_PyUnicode_HASH(unicode) != -1)
1809 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001810 if (PyUnicode_CHECK_INTERNED(unicode))
1811 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001812 if (!PyUnicode_CheckExact(unicode))
1813 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001814#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001815 /* singleton refcount is greater than 1 */
1816 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001817#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001818 return 1;
1819}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001820
Victor Stinnerfe226c02011-10-03 03:52:20 +02001821static int
1822unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1823{
1824 PyObject *unicode;
1825 Py_ssize_t old_length;
1826
1827 assert(p_unicode != NULL);
1828 unicode = *p_unicode;
1829
1830 assert(unicode != NULL);
1831 assert(PyUnicode_Check(unicode));
1832 assert(0 <= length);
1833
Victor Stinner910337b2011-10-03 03:20:16 +02001834 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001835 old_length = PyUnicode_WSTR_LENGTH(unicode);
1836 else
1837 old_length = PyUnicode_GET_LENGTH(unicode);
1838 if (old_length == length)
1839 return 0;
1840
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001841 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001842 _Py_INCREF_UNICODE_EMPTY();
1843 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001844 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001845 Py_SETREF(*p_unicode, unicode_empty);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001846 return 0;
1847 }
1848
Victor Stinner488fa492011-12-12 00:01:39 +01001849 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001850 PyObject *copy = resize_copy(unicode, length);
1851 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001852 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001853 Py_SETREF(*p_unicode, copy);
Benjamin Peterson29060642009-01-31 22:14:21 +00001854 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001855 }
1856
Victor Stinnerfe226c02011-10-03 03:52:20 +02001857 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001858 PyObject *new_unicode = resize_compact(unicode, length);
1859 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001860 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001861 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001862 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001863 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001864 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001865}
1866
Alexander Belopolsky40018472011-02-26 01:02:56 +00001867int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001868PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001869{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001870 PyObject *unicode;
1871 if (p_unicode == NULL) {
1872 PyErr_BadInternalCall();
1873 return -1;
1874 }
1875 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001876 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001877 {
1878 PyErr_BadInternalCall();
1879 return -1;
1880 }
1881 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001882}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001883
Serhiy Storchakad65c9492015-11-02 14:10:23 +02001884/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001885
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001886 WARNING: The function doesn't copy the terminating null character and
1887 doesn't check the maximum character (may write a latin1 character in an
1888 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001889static void
1890unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1891 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001892{
1893 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1894 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001895 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001896
1897 switch (kind) {
1898 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001899 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02001900#ifdef Py_DEBUG
1901 if (PyUnicode_IS_ASCII(unicode)) {
1902 Py_UCS4 maxchar = ucs1lib_find_max_char(
1903 (const Py_UCS1*)str,
1904 (const Py_UCS1*)str + len);
1905 assert(maxchar < 128);
1906 }
1907#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001908 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001909 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001910 }
1911 case PyUnicode_2BYTE_KIND: {
1912 Py_UCS2 *start = (Py_UCS2 *)data + index;
1913 Py_UCS2 *ucs2 = start;
1914 assert(index <= PyUnicode_GET_LENGTH(unicode));
1915
Victor Stinner184252a2012-06-16 02:57:41 +02001916 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001917 *ucs2 = (Py_UCS2)*str;
1918
1919 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001920 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001921 }
1922 default: {
1923 Py_UCS4 *start = (Py_UCS4 *)data + index;
1924 Py_UCS4 *ucs4 = start;
1925 assert(kind == PyUnicode_4BYTE_KIND);
1926 assert(index <= PyUnicode_GET_LENGTH(unicode));
1927
Victor Stinner184252a2012-06-16 02:57:41 +02001928 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001929 *ucs4 = (Py_UCS4)*str;
1930
1931 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001932 }
1933 }
1934}
1935
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001936static PyObject*
1937get_latin1_char(unsigned char ch)
1938{
Victor Stinnera464fc12011-10-02 20:39:30 +02001939 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001940 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001941 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001942 if (!unicode)
1943 return NULL;
1944 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001945 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001946 unicode_latin1[ch] = unicode;
1947 }
1948 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001949 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001950}
1951
Victor Stinner985a82a2014-01-03 12:53:47 +01001952static PyObject*
1953unicode_char(Py_UCS4 ch)
1954{
1955 PyObject *unicode;
1956
1957 assert(ch <= MAX_UNICODE);
1958
Victor Stinnerf3b46b42014-01-03 13:16:00 +01001959 if (ch < 256)
1960 return get_latin1_char(ch);
1961
Victor Stinner985a82a2014-01-03 12:53:47 +01001962 unicode = PyUnicode_New(1, ch);
1963 if (unicode == NULL)
1964 return NULL;
1965 switch (PyUnicode_KIND(unicode)) {
1966 case PyUnicode_1BYTE_KIND:
1967 PyUnicode_1BYTE_DATA(unicode)[0] = (Py_UCS1)ch;
1968 break;
1969 case PyUnicode_2BYTE_KIND:
1970 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
1971 break;
1972 default:
1973 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1974 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
1975 }
1976 assert(_PyUnicode_CheckConsistency(unicode, 1));
1977 return unicode;
1978}
1979
Alexander Belopolsky40018472011-02-26 01:02:56 +00001980PyObject *
1981PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001982{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001983 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001984 Py_UCS4 maxchar = 0;
1985 Py_ssize_t num_surrogates;
1986
1987 if (u == NULL)
1988 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001989
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001990 /* If the Unicode data is known at construction time, we can apply
1991 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001992
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001993 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02001994 if (size == 0)
1995 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00001996
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001997 /* Single character Unicode objects in the Latin-1 range are
1998 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001999 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002000 return get_latin1_char((unsigned char)*u);
2001
2002 /* If not empty and not single character, copy the Unicode data
2003 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02002004 if (find_maxchar_surrogates(u, u + size,
2005 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002006 return NULL;
2007
Victor Stinner8faf8212011-12-08 22:14:11 +01002008 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002009 if (!unicode)
2010 return NULL;
2011
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002012 switch (PyUnicode_KIND(unicode)) {
2013 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002014 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002015 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2016 break;
2017 case PyUnicode_2BYTE_KIND:
2018#if Py_UNICODE_SIZE == 2
2019 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
2020#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002021 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002022 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2023#endif
2024 break;
2025 case PyUnicode_4BYTE_KIND:
2026#if SIZEOF_WCHAR_T == 2
2027 /* This is the only case which has to process surrogates, thus
2028 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02002029 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002030#else
2031 assert(num_surrogates == 0);
2032 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
2033#endif
2034 break;
2035 default:
2036 assert(0 && "Impossible state");
2037 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002038
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002039 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002040}
2041
Alexander Belopolsky40018472011-02-26 01:02:56 +00002042PyObject *
2043PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002044{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002045 if (size < 0) {
2046 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00002047 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00002048 return NULL;
2049 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002050 if (u != NULL)
2051 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2052 else
2053 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002054}
2055
Alexander Belopolsky40018472011-02-26 01:02:56 +00002056PyObject *
2057PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00002058{
2059 size_t size = strlen(u);
2060 if (size > PY_SSIZE_T_MAX) {
2061 PyErr_SetString(PyExc_OverflowError, "input too long");
2062 return NULL;
2063 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002064 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002065}
2066
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002067PyObject *
2068_PyUnicode_FromId(_Py_Identifier *id)
2069{
2070 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01002071 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
2072 strlen(id->string),
2073 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002074 if (!id->object)
2075 return NULL;
2076 PyUnicode_InternInPlace(&id->object);
2077 assert(!id->next);
2078 id->next = static_strings;
2079 static_strings = id;
2080 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002081 return id->object;
2082}
2083
2084void
2085_PyUnicode_ClearStaticStrings()
2086{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002087 _Py_Identifier *tmp, *s = static_strings;
2088 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02002089 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002090 tmp = s->next;
2091 s->next = NULL;
2092 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002093 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002094 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002095}
2096
Benjamin Peterson0df54292012-03-26 14:50:32 -04002097/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002098
Victor Stinnerd3f08822012-05-29 12:57:52 +02002099PyObject*
2100_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02002101{
Victor Stinnerd3f08822012-05-29 12:57:52 +02002102 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01002103 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01002104 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02002105#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002106 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02002107#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002108 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01002109 }
Victor Stinner785938e2011-12-11 20:09:03 +01002110 unicode = PyUnicode_New(size, 127);
2111 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02002112 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01002113 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2114 assert(_PyUnicode_CheckConsistency(unicode, 1));
2115 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02002116}
2117
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002118static Py_UCS4
2119kind_maxchar_limit(unsigned int kind)
2120{
Benjamin Petersonead6b532011-12-20 17:23:42 -06002121 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002122 case PyUnicode_1BYTE_KIND:
2123 return 0x80;
2124 case PyUnicode_2BYTE_KIND:
2125 return 0x100;
2126 case PyUnicode_4BYTE_KIND:
2127 return 0x10000;
2128 default:
2129 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01002130 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002131 }
2132}
2133
Victor Stinnere6abb482012-05-02 01:15:40 +02002134Py_LOCAL_INLINE(Py_UCS4)
2135align_maxchar(Py_UCS4 maxchar)
2136{
2137 if (maxchar <= 127)
2138 return 127;
2139 else if (maxchar <= 255)
2140 return 255;
2141 else if (maxchar <= 65535)
2142 return 65535;
2143 else
2144 return MAX_UNICODE;
2145}
2146
Victor Stinner702c7342011-10-05 13:50:52 +02002147static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002148_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00002149{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002150 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002151 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002152
Serhiy Storchaka678db842013-01-26 12:16:36 +02002153 if (size == 0)
2154 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002155 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002156 if (size == 1)
2157 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002158
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002159 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002160 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002161 if (!res)
2162 return NULL;
2163 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002164 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002165 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00002166}
2167
Victor Stinnere57b1c02011-09-28 22:20:48 +02002168static PyObject*
2169_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002170{
2171 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002172 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002173
Serhiy Storchaka678db842013-01-26 12:16:36 +02002174 if (size == 0)
2175 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002176 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002177 if (size == 1)
2178 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002179
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002180 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002181 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002182 if (!res)
2183 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002184 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002185 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002186 else {
2187 _PyUnicode_CONVERT_BYTES(
2188 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2189 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002190 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002191 return res;
2192}
2193
Victor Stinnere57b1c02011-09-28 22:20:48 +02002194static PyObject*
2195_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002196{
2197 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002198 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002199
Serhiy Storchaka678db842013-01-26 12:16:36 +02002200 if (size == 0)
2201 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002202 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002203 if (size == 1)
2204 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002205
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002206 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002207 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002208 if (!res)
2209 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002210 if (max_char < 256)
2211 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2212 PyUnicode_1BYTE_DATA(res));
2213 else if (max_char < 0x10000)
2214 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2215 PyUnicode_2BYTE_DATA(res));
2216 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002217 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002218 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002219 return res;
2220}
2221
2222PyObject*
2223PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2224{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002225 if (size < 0) {
2226 PyErr_SetString(PyExc_ValueError, "size must be positive");
2227 return NULL;
2228 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002229 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002230 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002231 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002232 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002233 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002234 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002235 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002236 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002237 PyErr_SetString(PyExc_SystemError, "invalid kind");
2238 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002239 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002240}
2241
Victor Stinnerece58de2012-04-23 23:36:38 +02002242Py_UCS4
2243_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2244{
2245 enum PyUnicode_Kind kind;
2246 void *startptr, *endptr;
2247
2248 assert(PyUnicode_IS_READY(unicode));
2249 assert(0 <= start);
2250 assert(end <= PyUnicode_GET_LENGTH(unicode));
2251 assert(start <= end);
2252
2253 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2254 return PyUnicode_MAX_CHAR_VALUE(unicode);
2255
2256 if (start == end)
2257 return 127;
2258
Victor Stinner94d558b2012-04-27 22:26:58 +02002259 if (PyUnicode_IS_ASCII(unicode))
2260 return 127;
2261
Victor Stinnerece58de2012-04-23 23:36:38 +02002262 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002263 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002264 endptr = (char *)startptr + end * kind;
2265 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002266 switch(kind) {
2267 case PyUnicode_1BYTE_KIND:
2268 return ucs1lib_find_max_char(startptr, endptr);
2269 case PyUnicode_2BYTE_KIND:
2270 return ucs2lib_find_max_char(startptr, endptr);
2271 case PyUnicode_4BYTE_KIND:
2272 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002273 default:
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002274 assert(0);
2275 return 0;
Victor Stinnerece58de2012-04-23 23:36:38 +02002276 }
2277}
2278
Victor Stinner25a4b292011-10-06 12:31:55 +02002279/* Ensure that a string uses the most efficient storage, if it is not the
2280 case: create a new string with of the right kind. Write NULL into *p_unicode
2281 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002282static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002283unicode_adjust_maxchar(PyObject **p_unicode)
2284{
2285 PyObject *unicode, *copy;
2286 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002287 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002288 unsigned int kind;
2289
2290 assert(p_unicode != NULL);
2291 unicode = *p_unicode;
2292 assert(PyUnicode_IS_READY(unicode));
2293 if (PyUnicode_IS_ASCII(unicode))
2294 return;
2295
2296 len = PyUnicode_GET_LENGTH(unicode);
2297 kind = PyUnicode_KIND(unicode);
2298 if (kind == PyUnicode_1BYTE_KIND) {
2299 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002300 max_char = ucs1lib_find_max_char(u, u + len);
2301 if (max_char >= 128)
2302 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002303 }
2304 else if (kind == PyUnicode_2BYTE_KIND) {
2305 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002306 max_char = ucs2lib_find_max_char(u, u + len);
2307 if (max_char >= 256)
2308 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002309 }
2310 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002311 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002312 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002313 max_char = ucs4lib_find_max_char(u, u + len);
2314 if (max_char >= 0x10000)
2315 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002316 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002317 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002318 if (copy != NULL)
2319 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002320 Py_DECREF(unicode);
2321 *p_unicode = copy;
2322}
2323
Victor Stinner034f6cf2011-09-30 02:26:44 +02002324PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002325_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002326{
Victor Stinner87af4f22011-11-21 23:03:47 +01002327 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002328 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002329
Victor Stinner034f6cf2011-09-30 02:26:44 +02002330 if (!PyUnicode_Check(unicode)) {
2331 PyErr_BadInternalCall();
2332 return NULL;
2333 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002334 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002335 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002336
Victor Stinner87af4f22011-11-21 23:03:47 +01002337 length = PyUnicode_GET_LENGTH(unicode);
2338 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002339 if (!copy)
2340 return NULL;
2341 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2342
Victor Stinner87af4f22011-11-21 23:03:47 +01002343 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2344 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002345 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002346 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002347}
2348
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002349
Victor Stinnerbc603d12011-10-02 01:00:40 +02002350/* Widen Unicode objects to larger buffers. Don't write terminating null
2351 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002352
2353void*
2354_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2355{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002356 Py_ssize_t len;
2357 void *result;
2358 unsigned int skind;
2359
Benjamin Petersonbac79492012-01-14 13:34:47 -05002360 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002361 return NULL;
2362
2363 len = PyUnicode_GET_LENGTH(s);
2364 skind = PyUnicode_KIND(s);
2365 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002366 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002367 return NULL;
2368 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002369 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002370 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002371 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002372 if (!result)
2373 return PyErr_NoMemory();
2374 assert(skind == PyUnicode_1BYTE_KIND);
2375 _PyUnicode_CONVERT_BYTES(
2376 Py_UCS1, Py_UCS2,
2377 PyUnicode_1BYTE_DATA(s),
2378 PyUnicode_1BYTE_DATA(s) + len,
2379 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002380 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002381 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002382 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002383 if (!result)
2384 return PyErr_NoMemory();
2385 if (skind == PyUnicode_2BYTE_KIND) {
2386 _PyUnicode_CONVERT_BYTES(
2387 Py_UCS2, Py_UCS4,
2388 PyUnicode_2BYTE_DATA(s),
2389 PyUnicode_2BYTE_DATA(s) + len,
2390 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002391 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002392 else {
2393 assert(skind == PyUnicode_1BYTE_KIND);
2394 _PyUnicode_CONVERT_BYTES(
2395 Py_UCS1, Py_UCS4,
2396 PyUnicode_1BYTE_DATA(s),
2397 PyUnicode_1BYTE_DATA(s) + len,
2398 result);
2399 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002400 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002401 default:
2402 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002403 }
Victor Stinner01698042011-10-04 00:04:26 +02002404 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002405 return NULL;
2406}
2407
2408static Py_UCS4*
2409as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2410 int copy_null)
2411{
2412 int kind;
2413 void *data;
2414 Py_ssize_t len, targetlen;
2415 if (PyUnicode_READY(string) == -1)
2416 return NULL;
2417 kind = PyUnicode_KIND(string);
2418 data = PyUnicode_DATA(string);
2419 len = PyUnicode_GET_LENGTH(string);
2420 targetlen = len;
2421 if (copy_null)
2422 targetlen++;
2423 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002424 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002425 if (!target) {
2426 PyErr_NoMemory();
2427 return NULL;
2428 }
2429 }
2430 else {
2431 if (targetsize < targetlen) {
2432 PyErr_Format(PyExc_SystemError,
2433 "string is longer than the buffer");
2434 if (copy_null && 0 < targetsize)
2435 target[0] = 0;
2436 return NULL;
2437 }
2438 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002439 if (kind == PyUnicode_1BYTE_KIND) {
2440 Py_UCS1 *start = (Py_UCS1 *) data;
2441 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002442 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002443 else if (kind == PyUnicode_2BYTE_KIND) {
2444 Py_UCS2 *start = (Py_UCS2 *) data;
2445 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2446 }
2447 else {
2448 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002449 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002450 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002451 if (copy_null)
2452 target[len] = 0;
2453 return target;
2454}
2455
2456Py_UCS4*
2457PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2458 int copy_null)
2459{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002460 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002461 PyErr_BadInternalCall();
2462 return NULL;
2463 }
2464 return as_ucs4(string, target, targetsize, copy_null);
2465}
2466
2467Py_UCS4*
2468PyUnicode_AsUCS4Copy(PyObject *string)
2469{
2470 return as_ucs4(string, NULL, 0, 1);
2471}
2472
2473#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00002474
Alexander Belopolsky40018472011-02-26 01:02:56 +00002475PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02002476PyUnicode_FromWideChar(const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002477{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002478 if (w == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002479 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02002480 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson29060642009-01-31 22:14:21 +00002481 PyErr_BadInternalCall();
2482 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002483 }
2484
Martin v. Löwis790465f2008-04-05 20:41:37 +00002485 if (size == -1) {
2486 size = wcslen(w);
2487 }
2488
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002489 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002490}
2491
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002492#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002493
/* Maximum number of characters required for the output of %lld or %p.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign.  53/22 is an upper bound for log10(256); the
   expression below evaluates 1 + ceil(SIZEOF_LONG_LONG*53/22) using
   integer arithmetic only. */
2497#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002498
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002499static int
2500unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2501 Py_ssize_t width, Py_ssize_t precision)
2502{
2503 Py_ssize_t length, fill, arglen;
2504 Py_UCS4 maxchar;
2505
2506 if (PyUnicode_READY(str) == -1)
2507 return -1;
2508
2509 length = PyUnicode_GET_LENGTH(str);
2510 if ((precision == -1 || precision >= length)
2511 && width <= length)
2512 return _PyUnicodeWriter_WriteStr(writer, str);
2513
2514 if (precision != -1)
2515 length = Py_MIN(precision, length);
2516
2517 arglen = Py_MAX(length, width);
2518 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2519 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2520 else
2521 maxchar = writer->maxchar;
2522
2523 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2524 return -1;
2525
2526 if (width > length) {
2527 fill = width - length;
2528 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2529 return -1;
2530 writer->pos += fill;
2531 }
2532
2533 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2534 str, 0, length);
2535 writer->pos += length;
2536 return 0;
2537}
2538
2539static int
2540unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2541 Py_ssize_t width, Py_ssize_t precision)
2542{
2543 /* UTF-8 */
2544 Py_ssize_t length;
2545 PyObject *unicode;
2546 int res;
2547
2548 length = strlen(str);
2549 if (precision != -1)
2550 length = Py_MIN(length, precision);
2551 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2552 if (unicode == NULL)
2553 return -1;
2554
2555 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2556 Py_DECREF(unicode);
2557 return res;
2558}
2559
Victor Stinner96865452011-03-01 23:44:09 +00002560static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002561unicode_fromformat_arg(_PyUnicodeWriter *writer,
2562 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002563{
Victor Stinnere215d962012-10-06 23:03:36 +02002564 const char *p;
2565 Py_ssize_t len;
2566 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002567 Py_ssize_t width;
2568 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002569 int longflag;
2570 int longlongflag;
2571 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002572 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002573
2574 p = f;
2575 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002576 zeropad = 0;
2577 if (*f == '0') {
2578 zeropad = 1;
2579 f++;
2580 }
Victor Stinner96865452011-03-01 23:44:09 +00002581
2582 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002583 width = -1;
2584 if (Py_ISDIGIT((unsigned)*f)) {
2585 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002586 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002587 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002588 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002589 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002590 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002591 return NULL;
2592 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002593 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002594 f++;
2595 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002596 }
2597 precision = -1;
2598 if (*f == '.') {
2599 f++;
2600 if (Py_ISDIGIT((unsigned)*f)) {
2601 precision = (*f - '0');
2602 f++;
2603 while (Py_ISDIGIT((unsigned)*f)) {
2604 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2605 PyErr_SetString(PyExc_ValueError,
2606 "precision too big");
2607 return NULL;
2608 }
2609 precision = (precision * 10) + (*f - '0');
2610 f++;
2611 }
2612 }
Victor Stinner96865452011-03-01 23:44:09 +00002613 if (*f == '%') {
2614 /* "%.3%s" => f points to "3" */
2615 f--;
2616 }
2617 }
2618 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002619 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002620 f--;
2621 }
Victor Stinner96865452011-03-01 23:44:09 +00002622
2623 /* Handle %ld, %lu, %lld and %llu. */
2624 longflag = 0;
2625 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002626 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002627 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002628 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002629 longflag = 1;
2630 ++f;
2631 }
2632#ifdef HAVE_LONG_LONG
2633 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002634 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002635 longlongflag = 1;
2636 f += 2;
2637 }
2638#endif
2639 }
2640 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002641 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002642 size_tflag = 1;
2643 ++f;
2644 }
Victor Stinnere215d962012-10-06 23:03:36 +02002645
2646 if (f[1] == '\0')
2647 writer->overallocate = 0;
2648
2649 switch (*f) {
2650 case 'c':
2651 {
2652 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002653 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002654 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002655 "character argument not in range(0x110000)");
2656 return NULL;
2657 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002658 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002659 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002660 break;
2661 }
2662
2663 case 'i':
2664 case 'd':
2665 case 'u':
2666 case 'x':
2667 {
2668 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002669 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002670 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002671
2672 if (*f == 'u') {
Victor Stinnere215d962012-10-06 23:03:36 +02002673 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002674 len = sprintf(buffer, "%lu",
Victor Stinnere215d962012-10-06 23:03:36 +02002675 va_arg(*vargs, unsigned long));
2676#ifdef HAVE_LONG_LONG
2677 else if (longlongflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002678 len = sprintf(buffer, "%" PY_FORMAT_LONG_LONG "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002679 va_arg(*vargs, unsigned PY_LONG_LONG));
2680#endif
2681 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002682 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002683 va_arg(*vargs, size_t));
2684 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002685 len = sprintf(buffer, "%u",
Victor Stinnere215d962012-10-06 23:03:36 +02002686 va_arg(*vargs, unsigned int));
2687 }
2688 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002689 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002690 }
2691 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002692 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002693 len = sprintf(buffer, "%li",
Victor Stinnere215d962012-10-06 23:03:36 +02002694 va_arg(*vargs, long));
2695#ifdef HAVE_LONG_LONG
2696 else if (longlongflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002697 len = sprintf(buffer, "%" PY_FORMAT_LONG_LONG "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002698 va_arg(*vargs, PY_LONG_LONG));
2699#endif
2700 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002701 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002702 va_arg(*vargs, Py_ssize_t));
2703 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002704 len = sprintf(buffer, "%i",
Victor Stinnere215d962012-10-06 23:03:36 +02002705 va_arg(*vargs, int));
2706 }
2707 assert(len >= 0);
2708
Victor Stinnere215d962012-10-06 23:03:36 +02002709 if (precision < len)
2710 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002711
2712 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002713 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2714 return NULL;
2715
Victor Stinnere215d962012-10-06 23:03:36 +02002716 if (width > precision) {
2717 Py_UCS4 fillchar;
2718 fill = width - precision;
2719 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002720 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2721 return NULL;
2722 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002723 }
Victor Stinner15a11362012-10-06 23:48:20 +02002724 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002725 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002726 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2727 return NULL;
2728 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002729 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002730
Victor Stinner4a587072013-11-19 12:54:53 +01002731 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2732 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002733 break;
2734 }
2735
2736 case 'p':
2737 {
2738 char number[MAX_LONG_LONG_CHARS];
2739
2740 len = sprintf(number, "%p", va_arg(*vargs, void*));
2741 assert(len >= 0);
2742
2743 /* %p is ill-defined: ensure leading 0x. */
2744 if (number[1] == 'X')
2745 number[1] = 'x';
2746 else if (number[1] != 'x') {
2747 memmove(number + 2, number,
2748 strlen(number) + 1);
2749 number[0] = '0';
2750 number[1] = 'x';
2751 len += 2;
2752 }
2753
Victor Stinner4a587072013-11-19 12:54:53 +01002754 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002755 return NULL;
2756 break;
2757 }
2758
2759 case 's':
2760 {
2761 /* UTF-8 */
2762 const char *s = va_arg(*vargs, const char*);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002763 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002764 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002765 break;
2766 }
2767
2768 case 'U':
2769 {
2770 PyObject *obj = va_arg(*vargs, PyObject *);
2771 assert(obj && _PyUnicode_CHECK(obj));
2772
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002773 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002774 return NULL;
2775 break;
2776 }
2777
2778 case 'V':
2779 {
2780 PyObject *obj = va_arg(*vargs, PyObject *);
2781 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002782 if (obj) {
2783 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002784 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002785 return NULL;
2786 }
2787 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002788 assert(str != NULL);
2789 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002790 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002791 }
2792 break;
2793 }
2794
2795 case 'S':
2796 {
2797 PyObject *obj = va_arg(*vargs, PyObject *);
2798 PyObject *str;
2799 assert(obj);
2800 str = PyObject_Str(obj);
2801 if (!str)
2802 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002803 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002804 Py_DECREF(str);
2805 return NULL;
2806 }
2807 Py_DECREF(str);
2808 break;
2809 }
2810
2811 case 'R':
2812 {
2813 PyObject *obj = va_arg(*vargs, PyObject *);
2814 PyObject *repr;
2815 assert(obj);
2816 repr = PyObject_Repr(obj);
2817 if (!repr)
2818 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002819 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002820 Py_DECREF(repr);
2821 return NULL;
2822 }
2823 Py_DECREF(repr);
2824 break;
2825 }
2826
2827 case 'A':
2828 {
2829 PyObject *obj = va_arg(*vargs, PyObject *);
2830 PyObject *ascii;
2831 assert(obj);
2832 ascii = PyObject_ASCII(obj);
2833 if (!ascii)
2834 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002835 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002836 Py_DECREF(ascii);
2837 return NULL;
2838 }
2839 Py_DECREF(ascii);
2840 break;
2841 }
2842
2843 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002844 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002845 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002846 break;
2847
2848 default:
2849 /* if we stumble upon an unknown formatting code, copy the rest
2850 of the format string to the output string. (we cannot just
2851 skip the code, since there's no way to know what's in the
2852 argument list) */
2853 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01002854 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002855 return NULL;
2856 f = p+len;
2857 return f;
2858 }
2859
2860 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002861 return f;
2862}
2863
Walter Dörwaldd2034312007-05-18 16:29:38 +00002864PyObject *
2865PyUnicode_FromFormatV(const char *format, va_list vargs)
2866{
Victor Stinnere215d962012-10-06 23:03:36 +02002867 va_list vargs2;
2868 const char *f;
2869 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002870
Victor Stinner8f674cc2013-04-17 23:02:17 +02002871 _PyUnicodeWriter_Init(&writer);
2872 writer.min_length = strlen(format) + 100;
2873 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02002874
2875 /* va_list may be an array (of 1 item) on some platforms (ex: AMD64).
2876 Copy it to be able to pass a reference to a subfunction. */
2877 Py_VA_COPY(vargs2, vargs);
2878
2879 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002880 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002881 f = unicode_fromformat_arg(&writer, f, &vargs2);
2882 if (f == NULL)
2883 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002884 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002885 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002886 const char *p;
2887 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002888
Victor Stinnere215d962012-10-06 23:03:36 +02002889 p = f;
2890 do
2891 {
2892 if ((unsigned char)*p > 127) {
2893 PyErr_Format(PyExc_ValueError,
2894 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2895 "string, got a non-ASCII byte: 0x%02x",
2896 (unsigned char)*p);
2897 return NULL;
2898 }
2899 p++;
2900 }
2901 while (*p != '\0' && *p != '%');
2902 len = p - f;
2903
2904 if (*p == '\0')
2905 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01002906
2907 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002908 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002909
2910 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002911 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002912 }
Victor Stinnere215d962012-10-06 23:03:36 +02002913 return _PyUnicodeWriter_Finish(&writer);
2914
2915 fail:
2916 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002917 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002918}
2919
Walter Dörwaldd2034312007-05-18 16:29:38 +00002920PyObject *
2921PyUnicode_FromFormat(const char *format, ...)
2922{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002923 PyObject* ret;
2924 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002925
2926#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002927 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002928#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002929 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002930#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002931 ret = PyUnicode_FromFormatV(format, vargs);
2932 va_end(vargs);
2933 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002934}
2935
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002936#ifdef HAVE_WCHAR_H
2937
Victor Stinner5593d8a2010-10-02 11:11:27 +00002938/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2939 convert a Unicode object to a wide character string.
2940
Victor Stinnerd88d9832011-09-06 02:00:05 +02002941 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002942 character) required to convert the unicode object. Ignore size argument.
2943
Victor Stinnerd88d9832011-09-06 02:00:05 +02002944 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002945 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002946 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002947static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002948unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002949 wchar_t *w,
2950 Py_ssize_t size)
2951{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002952 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002953 const wchar_t *wstr;
2954
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002955 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002956 if (wstr == NULL)
2957 return -1;
2958
Victor Stinner5593d8a2010-10-02 11:11:27 +00002959 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002960 if (size > res)
2961 size = res + 1;
2962 else
2963 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002964 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002965 return res;
2966 }
2967 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002968 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002969}
2970
2971Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002972PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002973 wchar_t *w,
2974 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002975{
2976 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002977 PyErr_BadInternalCall();
2978 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002979 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002980 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002981}
2982
Victor Stinner137c34c2010-09-29 10:25:54 +00002983wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002984PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002985 Py_ssize_t *size)
2986{
2987 wchar_t* buffer;
2988 Py_ssize_t buflen;
2989
2990 if (unicode == NULL) {
2991 PyErr_BadInternalCall();
2992 return NULL;
2993 }
2994
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002995 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002996 if (buflen == -1)
2997 return NULL;
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002998 buffer = PyMem_NEW(wchar_t, buflen);
Victor Stinner137c34c2010-09-29 10:25:54 +00002999 if (buffer == NULL) {
3000 PyErr_NoMemory();
3001 return NULL;
3002 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003003 buflen = unicode_aswidechar(unicode, buffer, buflen);
Stefan Krah8528c312012-08-19 21:52:43 +02003004 if (buflen == -1) {
3005 PyMem_FREE(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003006 return NULL;
Stefan Krah8528c312012-08-19 21:52:43 +02003007 }
Victor Stinner5593d8a2010-10-02 11:11:27 +00003008 if (size != NULL)
3009 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00003010 return buffer;
3011}
3012
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003013#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003014
Alexander Belopolsky40018472011-02-26 01:02:56 +00003015PyObject *
3016PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003017{
Victor Stinner8faf8212011-12-08 22:14:11 +01003018 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003019 PyErr_SetString(PyExc_ValueError,
3020 "chr() arg not in range(0x110000)");
3021 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003022 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00003023
Victor Stinner985a82a2014-01-03 12:53:47 +01003024 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003025}
3026
Alexander Belopolsky40018472011-02-26 01:02:56 +00003027PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003028PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003029{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003030 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00003031 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003032 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003033 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02003034 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00003035 Py_INCREF(obj);
3036 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003037 }
3038 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003039 /* For a Unicode subtype that's not a Unicode object,
3040 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003041 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003042 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003043 PyErr_Format(PyExc_TypeError,
3044 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00003045 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003046 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003047}
3048
Alexander Belopolsky40018472011-02-26 01:02:56 +00003049PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003050PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003051 const char *encoding,
3052 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003053{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003054 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003055 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003056
Guido van Rossumd57fd912000-03-10 22:53:23 +00003057 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003058 PyErr_BadInternalCall();
3059 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003060 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003061
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003062 /* Decoding bytes objects is the most common case and should be fast */
3063 if (PyBytes_Check(obj)) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003064 if (PyBytes_GET_SIZE(obj) == 0)
3065 _Py_RETURN_UNICODE_EMPTY();
3066 v = PyUnicode_Decode(
3067 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3068 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003069 return v;
3070 }
3071
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003072 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003073 PyErr_SetString(PyExc_TypeError,
3074 "decoding str is not supported");
3075 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003076 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003077
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003078 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3079 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3080 PyErr_Format(PyExc_TypeError,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003081 "decoding to str: need a bytes-like object, %.80s found",
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003082 Py_TYPE(obj)->tp_name);
3083 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003084 }
Tim Petersced69f82003-09-16 20:30:58 +00003085
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003086 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003087 PyBuffer_Release(&buffer);
3088 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003089 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003090
Serhiy Storchaka05997252013-01-26 12:14:02 +02003091 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003092 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003093 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003094}
3095
/* Convert encoding to lower case and replace '_' with '-' in order to
   catch e.g. UTF_8.  A NULL encoding is normalized to "utf-8".

   `lower` is the output buffer and `lower_len` its capacity in bytes,
   terminating null included.

   Return 0 on error (the buffer is empty, or the encoding is longer than
   lower_len-1), 1 on success. */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;

    /* A zero-sized buffer cannot even hold the terminator; bail out before
       `lower_len - 1` wraps around and the end pointer goes out of bounds. */
    if (lower_len == 0)
        return 0;

    if (encoding == NULL) {
        /* 6 == strlen("utf-8") + 1 */
        if (lower_len < 6)
            return 0;
        strcpy(lower, "utf-8");
        return 1;
    }
    e = encoding;
    l = lower;
    /* Last writable slot, reserved for the terminating null. */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
3135
Alexander Belopolsky40018472011-02-26 01:02:56 +00003136PyObject *
3137PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003138 Py_ssize_t size,
3139 const char *encoding,
3140 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003141{
3142 PyObject *buffer = NULL, *unicode;
3143 Py_buffer info;
3144 char lower[11]; /* Enough for any encoding shortcut */
3145
Fred Drakee4315f52000-05-09 19:53:39 +00003146 /* Shortcuts for common default encodings */
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003147 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003148 if ((strcmp(lower, "utf-8") == 0) ||
3149 (strcmp(lower, "utf8") == 0))
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003150 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
Victor Stinner37296e82010-06-10 13:36:23 +00003151 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003152 (strcmp(lower, "latin1") == 0) ||
Victor Stinnerfa3ba4c2013-10-29 11:34:05 +01003153 (strcmp(lower, "iso-8859-1") == 0) ||
3154 (strcmp(lower, "iso8859-1") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00003155 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003156#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00003157 else if (strcmp(lower, "mbcs") == 0)
3158 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003159#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003160 else if (strcmp(lower, "ascii") == 0)
3161 return PyUnicode_DecodeASCII(s, size, errors);
3162 else if (strcmp(lower, "utf-16") == 0)
3163 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3164 else if (strcmp(lower, "utf-32") == 0)
3165 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3166 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003167
3168 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003169 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003170 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003171 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003172 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003173 if (buffer == NULL)
3174 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003175 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003176 if (unicode == NULL)
3177 goto onError;
3178 if (!PyUnicode_Check(unicode)) {
3179 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003180 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3181 "use codecs.decode() to decode to arbitrary types",
3182 encoding,
3183 Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003184 Py_DECREF(unicode);
3185 goto onError;
3186 }
3187 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003188 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003189
Benjamin Peterson29060642009-01-31 22:14:21 +00003190 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003191 Py_XDECREF(buffer);
3192 return NULL;
3193}
3194
Alexander Belopolsky40018472011-02-26 01:02:56 +00003195PyObject *
3196PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003197 const char *encoding,
3198 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003199{
3200 PyObject *v;
3201
3202 if (!PyUnicode_Check(unicode)) {
3203 PyErr_BadArgument();
3204 goto onError;
3205 }
3206
3207 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003208 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003209
3210 /* Decode via the codec registry */
3211 v = PyCodec_Decode(unicode, encoding, errors);
3212 if (v == NULL)
3213 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003214 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003215
Benjamin Peterson29060642009-01-31 22:14:21 +00003216 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003217 return NULL;
3218}
3219
Alexander Belopolsky40018472011-02-26 01:02:56 +00003220PyObject *
3221PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003222 const char *encoding,
3223 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003224{
3225 PyObject *v;
3226
3227 if (!PyUnicode_Check(unicode)) {
3228 PyErr_BadArgument();
3229 goto onError;
3230 }
3231
3232 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003233 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003234
3235 /* Decode via the codec registry */
3236 v = PyCodec_Decode(unicode, encoding, errors);
3237 if (v == NULL)
3238 goto onError;
3239 if (!PyUnicode_Check(v)) {
3240 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003241 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3242 "use codecs.decode() to decode to arbitrary types",
3243 encoding,
3244 Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003245 Py_DECREF(v);
3246 goto onError;
3247 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003248 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003249
Benjamin Peterson29060642009-01-31 22:14:21 +00003250 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003251 return NULL;
3252}
3253
Alexander Belopolsky40018472011-02-26 01:02:56 +00003254PyObject *
3255PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003256 Py_ssize_t size,
3257 const char *encoding,
3258 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003259{
3260 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003261
Guido van Rossumd57fd912000-03-10 22:53:23 +00003262 unicode = PyUnicode_FromUnicode(s, size);
3263 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003264 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003265 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3266 Py_DECREF(unicode);
3267 return v;
3268}
3269
Alexander Belopolsky40018472011-02-26 01:02:56 +00003270PyObject *
3271PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003272 const char *encoding,
3273 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003274{
3275 PyObject *v;
3276
3277 if (!PyUnicode_Check(unicode)) {
3278 PyErr_BadArgument();
3279 goto onError;
3280 }
3281
3282 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003283 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003284
3285 /* Encode via the codec registry */
3286 v = PyCodec_Encode(unicode, encoding, errors);
3287 if (v == NULL)
3288 goto onError;
3289 return v;
3290
Benjamin Peterson29060642009-01-31 22:14:21 +00003291 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003292 return NULL;
3293}
3294
/* Locate the first wide character of `wstr` that wcstombs() refuses to
   convert, by converting the string one character at a time (one surrogate
   pair at a time when wchar_t is 16 bits wide).

   Return the index of that character in `wstr`, or 0 when every character
   converts successfully. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
#if SIZEOF_WCHAR_T == 2
    wchar_t chunk[3];
#else
    wchar_t chunk[2];
#endif
    char encoded[MB_LEN_MAX];
    const wchar_t *cursor = wstr;

    /* The last slot is a permanent terminator for the scratch string. */
#if SIZEOF_WCHAR_T == 2
    chunk[2] = 0;
#else
    chunk[1] = 0;
#endif

    while (*cursor != L'\0') {
        const wchar_t *chunk_start = cursor;

#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cursor[1]))
        {
            /* Keep a surrogate pair together. */
            chunk[0] = cursor[0];
            chunk[1] = cursor[1];
            cursor += 2;
        }
        else {
            chunk[0] = *cursor;
            chunk[1] = 0;
            cursor++;
        }
#else
        chunk[0] = *cursor;
        cursor++;
#endif

        if (wcstombs(encoded, chunk, sizeof(encoded)) == (size_t)-1)
            return chunk_start - wstr;
    }

    /* failed to find the unencodable character */
    return 0;
}
3341
Victor Stinner1b579672011-12-17 05:47:23 +01003342static int
3343locale_error_handler(const char *errors, int *surrogateescape)
3344{
Victor Stinner50149202015-09-22 00:26:54 +02003345 _Py_error_handler error_handler = get_error_handler(errors);
3346 switch (error_handler)
3347 {
3348 case _Py_ERROR_STRICT:
Victor Stinner1b579672011-12-17 05:47:23 +01003349 *surrogateescape = 0;
3350 return 0;
Victor Stinner50149202015-09-22 00:26:54 +02003351 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner1b579672011-12-17 05:47:23 +01003352 *surrogateescape = 1;
3353 return 0;
Victor Stinner50149202015-09-22 00:26:54 +02003354 default:
3355 PyErr_Format(PyExc_ValueError,
3356 "only 'strict' and 'surrogateescape' error handlers "
3357 "are supported, not '%s'",
3358 errors);
3359 return -1;
Victor Stinner1b579672011-12-17 05:47:23 +01003360 }
Victor Stinner1b579672011-12-17 05:47:23 +01003361}
3362
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003363PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003364PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003365{
3366 Py_ssize_t wlen, wlen2;
3367 wchar_t *wstr;
3368 PyObject *bytes = NULL;
3369 char *errmsg;
Raymond Hettingere56666d2013-08-04 11:51:03 -07003370 PyObject *reason = NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003371 PyObject *exc;
3372 size_t error_pos;
Victor Stinner1b579672011-12-17 05:47:23 +01003373 int surrogateescape;
3374
3375 if (locale_error_handler(errors, &surrogateescape) < 0)
3376 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003377
3378 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3379 if (wstr == NULL)
3380 return NULL;
3381
3382 wlen2 = wcslen(wstr);
3383 if (wlen2 != wlen) {
3384 PyMem_Free(wstr);
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003385 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003386 return NULL;
3387 }
3388
3389 if (surrogateescape) {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003390 /* "surrogateescape" error handler */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003391 char *str;
3392
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003393 str = Py_EncodeLocale(wstr, &error_pos);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003394 if (str == NULL) {
3395 if (error_pos == (size_t)-1) {
3396 PyErr_NoMemory();
3397 PyMem_Free(wstr);
3398 return NULL;
3399 }
3400 else {
3401 goto encode_error;
3402 }
3403 }
3404 PyMem_Free(wstr);
3405
3406 bytes = PyBytes_FromString(str);
3407 PyMem_Free(str);
3408 }
3409 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003410 /* strict mode */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003411 size_t len, len2;
3412
3413 len = wcstombs(NULL, wstr, 0);
3414 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003415 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003416 goto encode_error;
3417 }
3418
3419 bytes = PyBytes_FromStringAndSize(NULL, len);
3420 if (bytes == NULL) {
3421 PyMem_Free(wstr);
3422 return NULL;
3423 }
3424
3425 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3426 if (len2 == (size_t)-1 || len2 > len) {
Victor Stinner2f197072011-12-17 07:08:30 +01003427 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003428 goto encode_error;
3429 }
3430 PyMem_Free(wstr);
3431 }
3432 return bytes;
3433
3434encode_error:
3435 errmsg = strerror(errno);
3436 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003437
3438 if (error_pos == (size_t)-1)
3439 error_pos = wcstombs_errorpos(wstr);
3440
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003441 PyMem_Free(wstr);
3442 Py_XDECREF(bytes);
3443
Victor Stinner2f197072011-12-17 07:08:30 +01003444 if (errmsg != NULL) {
3445 size_t errlen;
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003446 wstr = Py_DecodeLocale(errmsg, &errlen);
Victor Stinner2f197072011-12-17 07:08:30 +01003447 if (wstr != NULL) {
3448 reason = PyUnicode_FromWideChar(wstr, errlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003449 PyMem_RawFree(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003450 } else
3451 errmsg = NULL;
3452 }
3453 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003454 reason = PyUnicode_FromString(
3455 "wcstombs() encountered an unencodable "
3456 "wide character");
3457 if (reason == NULL)
3458 return NULL;
3459
3460 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3461 "locale", unicode,
3462 (Py_ssize_t)error_pos,
3463 (Py_ssize_t)(error_pos+1),
3464 reason);
3465 Py_DECREF(reason);
3466 if (exc != NULL) {
3467 PyCodec_StrictErrors(exc);
3468 Py_XDECREF(exc);
3469 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003470 return NULL;
3471}
3472
Victor Stinnerad158722010-10-27 00:25:46 +00003473PyObject *
3474PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003475{
Victor Stinner99b95382011-07-04 14:23:54 +02003476#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003477 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003478#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003479 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003480#else
Victor Stinner793b5312011-04-27 00:24:21 +02003481 PyInterpreterState *interp = PyThreadState_GET()->interp;
3482 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3483 cannot use it to encode and decode filenames before it is loaded. Load
3484 the Python codec requires to encode at least its own filename. Use the C
3485 version of the locale codec until the codec registry is initialized and
3486 the Python codec is loaded.
3487
3488 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3489 cannot only rely on it: check also interp->fscodec_initialized for
3490 subinterpreters. */
3491 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003492 return PyUnicode_AsEncodedString(unicode,
3493 Py_FileSystemDefaultEncoding,
3494 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003495 }
3496 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003497 return PyUnicode_EncodeLocale(unicode, "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003498 }
Victor Stinnerad158722010-10-27 00:25:46 +00003499#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003500}
3501
Alexander Belopolsky40018472011-02-26 01:02:56 +00003502PyObject *
3503PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003504 const char *encoding,
3505 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003506{
3507 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003508 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003509
Guido van Rossumd57fd912000-03-10 22:53:23 +00003510 if (!PyUnicode_Check(unicode)) {
3511 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003512 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003513 }
Fred Drakee4315f52000-05-09 19:53:39 +00003514
Fred Drakee4315f52000-05-09 19:53:39 +00003515 /* Shortcuts for common default encodings */
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003516 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003517 if ((strcmp(lower, "utf-8") == 0) ||
3518 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003519 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003520 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003521 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003522 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003523 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003524 }
Victor Stinner37296e82010-06-10 13:36:23 +00003525 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003526 (strcmp(lower, "latin1") == 0) ||
Victor Stinnerfa3ba4c2013-10-29 11:34:05 +01003527 (strcmp(lower, "iso-8859-1") == 0) ||
3528 (strcmp(lower, "iso8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003529 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003530#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003531 else if (strcmp(lower, "mbcs") == 0)
3532 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003533#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003534 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003535 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003536 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003537
3538 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003539 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003540 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003541 return NULL;
3542
3543 /* The normal path */
3544 if (PyBytes_Check(v))
3545 return v;
3546
3547 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003548 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003549 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003550 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003551
3552 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003553 "encoder %s returned bytearray instead of bytes; "
3554 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003555 encoding);
3556 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003557 Py_DECREF(v);
3558 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003559 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003560
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003561 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3562 Py_DECREF(v);
3563 return b;
3564 }
3565
3566 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003567 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
3568 "use codecs.encode() to encode to arbitrary types",
3569 encoding,
3570 Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003571 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003572 return NULL;
3573}
3574
Alexander Belopolsky40018472011-02-26 01:02:56 +00003575PyObject *
3576PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003577 const char *encoding,
3578 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003579{
3580 PyObject *v;
3581
3582 if (!PyUnicode_Check(unicode)) {
3583 PyErr_BadArgument();
3584 goto onError;
3585 }
3586
3587 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003588 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003589
3590 /* Encode via the codec registry */
3591 v = PyCodec_Encode(unicode, encoding, errors);
3592 if (v == NULL)
3593 goto onError;
3594 if (!PyUnicode_Check(v)) {
3595 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003596 "'%.400s' encoder returned '%.400s' instead of 'str'; "
3597 "use codecs.encode() to encode to arbitrary types",
3598 encoding,
3599 Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003600 Py_DECREF(v);
3601 goto onError;
3602 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003603 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003604
Benjamin Peterson29060642009-01-31 22:14:21 +00003605 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003606 return NULL;
3607}
3608
/* Locate the first byte of `str` (at most `len` bytes are examined) that
   mbrtowc() cannot decode in the current locale.

   Returns the byte offset of the invalid or incomplete sequence.  Returns 0
   when no such sequence is found, when an embedded null character stops
   the scan, or when HAVE_MBRTOWC is not defined. */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *cursor = str;
    size_t remaining = len;
    mbstate_t state;
    wchar_t decoded;

    memset(&state, 0, sizeof state);
    while (remaining != 0) {
        size_t consumed = mbrtowc(&decoded, cursor, remaining, &state);

        if (consumed == 0) {
            /* Reached end of string */
            break;
        }
        if (consumed == (size_t)-1 || consumed == (size_t)-2) {
            /* Conversion error or incomplete character */
            return cursor - str;
        }
        cursor += consumed;
        remaining -= consumed;
    }
#endif
    /* failed to find the undecodable byte sequence */
    return 0;
}
3639
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003640PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003641PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003642 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003643{
3644 wchar_t smallbuf[256];
3645 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3646 wchar_t *wstr;
3647 size_t wlen, wlen2;
3648 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003649 int surrogateescape;
Victor Stinner2f197072011-12-17 07:08:30 +01003650 size_t error_pos;
3651 char *errmsg;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01003652 PyObject *reason = NULL; /* initialize to prevent gcc warning */
3653 PyObject *exc;
Victor Stinner1b579672011-12-17 05:47:23 +01003654
3655 if (locale_error_handler(errors, &surrogateescape) < 0)
3656 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003657
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003658 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3659 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003660 return NULL;
3661 }
3662
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003663 if (surrogateescape) {
3664 /* "surrogateescape" error handler */
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003665 wstr = Py_DecodeLocale(str, &wlen);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003666 if (wstr == NULL) {
3667 if (wlen == (size_t)-1)
3668 PyErr_NoMemory();
3669 else
3670 PyErr_SetFromErrno(PyExc_OSError);
3671 return NULL;
3672 }
3673
3674 unicode = PyUnicode_FromWideChar(wstr, wlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003675 PyMem_RawFree(wstr);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003676 }
3677 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003678 /* strict mode */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003679#ifndef HAVE_BROKEN_MBSTOWCS
3680 wlen = mbstowcs(NULL, str, 0);
3681#else
3682 wlen = len;
3683#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003684 if (wlen == (size_t)-1)
3685 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003686 if (wlen+1 <= smallbuf_len) {
3687 wstr = smallbuf;
3688 }
3689 else {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02003690 wstr = PyMem_New(wchar_t, wlen+1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003691 if (!wstr)
3692 return PyErr_NoMemory();
3693 }
3694
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003695 wlen2 = mbstowcs(wstr, str, wlen+1);
3696 if (wlen2 == (size_t)-1) {
3697 if (wstr != smallbuf)
3698 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003699 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003700 }
3701#ifdef HAVE_BROKEN_MBSTOWCS
3702 assert(wlen2 == wlen);
3703#endif
3704 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3705 if (wstr != smallbuf)
3706 PyMem_Free(wstr);
3707 }
3708 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003709
3710decode_error:
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +02003711 reason = NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01003712 errmsg = strerror(errno);
3713 assert(errmsg != NULL);
3714
3715 error_pos = mbstowcs_errorpos(str, len);
3716 if (errmsg != NULL) {
3717 size_t errlen;
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003718 wstr = Py_DecodeLocale(errmsg, &errlen);
Victor Stinner2f197072011-12-17 07:08:30 +01003719 if (wstr != NULL) {
3720 reason = PyUnicode_FromWideChar(wstr, errlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003721 PyMem_RawFree(wstr);
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +02003722 }
Victor Stinner2f197072011-12-17 07:08:30 +01003723 }
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +02003724 if (reason == NULL)
Victor Stinner2f197072011-12-17 07:08:30 +01003725 reason = PyUnicode_FromString(
3726 "mbstowcs() encountered an invalid multibyte sequence");
3727 if (reason == NULL)
3728 return NULL;
3729
3730 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3731 "locale", str, len,
3732 (Py_ssize_t)error_pos,
3733 (Py_ssize_t)(error_pos+1),
3734 reason);
3735 Py_DECREF(reason);
3736 if (exc != NULL) {
3737 PyCodec_StrictErrors(exc);
3738 Py_XDECREF(exc);
3739 }
3740 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003741}
3742
3743PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003744PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003745{
3746 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003747 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003748}
3749
3750
3751PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003752PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003753 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003754 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3755}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003756
Christian Heimes5894ba72007-11-04 11:43:14 +00003757PyObject*
3758PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3759{
Victor Stinner99b95382011-07-04 14:23:54 +02003760#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003761 return PyUnicode_DecodeMBCS(s, size, NULL);
3762#elif defined(__APPLE__)
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003763 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003764#else
Victor Stinner793b5312011-04-27 00:24:21 +02003765 PyInterpreterState *interp = PyThreadState_GET()->interp;
3766 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3767 cannot use it to encode and decode filenames before it is loaded. Load
3768 the Python codec requires to encode at least its own filename. Use the C
3769 version of the locale codec until the codec registry is initialized and
3770 the Python codec is loaded.
3771
3772 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3773 cannot only rely on it: check also interp->fscodec_initialized for
3774 subinterpreters. */
3775 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003776 return PyUnicode_Decode(s, size,
3777 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003778 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003779 }
3780 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003781 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003782 }
Victor Stinnerad158722010-10-27 00:25:46 +00003783#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003784}
3785
Martin v. Löwis011e8422009-05-05 04:43:17 +00003786
3787int
3788PyUnicode_FSConverter(PyObject* arg, void* addr)
3789{
3790 PyObject *output = NULL;
3791 Py_ssize_t size;
3792 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003793 if (arg == NULL) {
3794 Py_DECREF(*(PyObject**)addr);
Benjamin Petersona4d33b32015-11-15 21:57:39 -08003795 *(PyObject**)addr = NULL;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003796 return 1;
3797 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003798 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003799 output = arg;
3800 Py_INCREF(output);
3801 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003802 else if (PyUnicode_Check(arg)) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003803 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003804 if (!output)
3805 return 0;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003806 assert(PyBytes_Check(output));
3807 }
3808 else {
3809 PyErr_Format(PyExc_TypeError,
3810 "must be str or bytes, not %.100s",
3811 Py_TYPE(arg)->tp_name);
3812 return 0;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003813 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003814 size = PyBytes_GET_SIZE(output);
3815 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02003816 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003817 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003818 Py_DECREF(output);
3819 return 0;
3820 }
3821 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003822 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003823}
3824
3825
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003826int
3827PyUnicode_FSDecoder(PyObject* arg, void* addr)
3828{
3829 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003830 if (arg == NULL) {
3831 Py_DECREF(*(PyObject**)addr);
3832 return 1;
3833 }
3834 if (PyUnicode_Check(arg)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003835 if (PyUnicode_READY(arg) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003836 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003837 output = arg;
3838 Py_INCREF(output);
3839 }
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003840 else if (PyObject_CheckBuffer(arg)) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003841 arg = PyBytes_FromObject(arg);
3842 if (!arg)
3843 return 0;
3844 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3845 PyBytes_GET_SIZE(arg));
3846 Py_DECREF(arg);
3847 if (!output)
3848 return 0;
3849 if (!PyUnicode_Check(output)) {
3850 Py_DECREF(output);
3851 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3852 return 0;
3853 }
3854 }
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003855 else {
3856 PyErr_Format(PyExc_TypeError,
3857 "path should be string or bytes, not %.200s",
3858 Py_TYPE(arg)->tp_name);
3859 return 0;
3860 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003861 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003862 Py_DECREF(output);
3863 return 0;
3864 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003865 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003866 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003867 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003868 Py_DECREF(output);
3869 return 0;
3870 }
3871 *(PyObject**)addr = output;
3872 return Py_CLEANUP_SUPPORTED;
3873}
3874
3875
Martin v. Löwis5b222132007-06-10 09:51:05 +00003876char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003877PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003878{
Christian Heimesf3863112007-11-22 07:46:41 +00003879 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003880
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003881 if (!PyUnicode_Check(unicode)) {
3882 PyErr_BadArgument();
3883 return NULL;
3884 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003885 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003886 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003887
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003888 if (PyUnicode_UTF8(unicode) == NULL) {
3889 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003890 bytes = _PyUnicode_AsUTF8String(unicode, NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003891 if (bytes == NULL)
3892 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003893 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3894 if (_PyUnicode_UTF8(unicode) == NULL) {
Victor Stinnera5afb582013-10-29 01:28:23 +01003895 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003896 Py_DECREF(bytes);
3897 return NULL;
3898 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003899 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3900 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3901 PyBytes_AS_STRING(bytes),
3902 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003903 Py_DECREF(bytes);
3904 }
3905
3906 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003907 *psize = PyUnicode_UTF8_LENGTH(unicode);
3908 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003909}
3910
3911char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003912PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003913{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003914 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3915}
3916
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003917Py_UNICODE *
3918PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3919{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003920 const unsigned char *one_byte;
3921#if SIZEOF_WCHAR_T == 4
3922 const Py_UCS2 *two_bytes;
3923#else
3924 const Py_UCS4 *four_bytes;
3925 const Py_UCS4 *ucs4_end;
3926 Py_ssize_t num_surrogates;
3927#endif
3928 wchar_t *w;
3929 wchar_t *wchar_end;
3930
3931 if (!PyUnicode_Check(unicode)) {
3932 PyErr_BadArgument();
3933 return NULL;
3934 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003935 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003936 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003937 assert(_PyUnicode_KIND(unicode) != 0);
3938 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003939
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003940 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003941#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003942 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3943 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003944 num_surrogates = 0;
3945
3946 for (; four_bytes < ucs4_end; ++four_bytes) {
3947 if (*four_bytes > 0xFFFF)
3948 ++num_surrogates;
3949 }
3950
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003951 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3952 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3953 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003954 PyErr_NoMemory();
3955 return NULL;
3956 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003957 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003958
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003959 w = _PyUnicode_WSTR(unicode);
3960 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3961 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003962 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3963 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01003964 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003965 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01003966 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3967 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003968 }
3969 else
3970 *w = *four_bytes;
3971
3972 if (w > wchar_end) {
3973 assert(0 && "Miscalculated string end");
3974 }
3975 }
3976 *w = 0;
3977#else
3978 /* sizeof(wchar_t) == 4 */
3979 Py_FatalError("Impossible unicode object state, wstr and str "
3980 "should share memory already.");
3981 return NULL;
3982#endif
3983 }
3984 else {
Serhiy Storchakae55181f2015-02-20 21:34:06 +02003985 if ((size_t)_PyUnicode_LENGTH(unicode) >
3986 PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
3987 PyErr_NoMemory();
3988 return NULL;
3989 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003990 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3991 (_PyUnicode_LENGTH(unicode) + 1));
3992 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003993 PyErr_NoMemory();
3994 return NULL;
3995 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003996 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3997 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3998 w = _PyUnicode_WSTR(unicode);
3999 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004000
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004001 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
4002 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004003 for (; w < wchar_end; ++one_byte, ++w)
4004 *w = *one_byte;
4005 /* null-terminate the wstr */
4006 *w = 0;
4007 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004008 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004009#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004010 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004011 for (; w < wchar_end; ++two_bytes, ++w)
4012 *w = *two_bytes;
4013 /* null-terminate the wstr */
4014 *w = 0;
4015#else
4016 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004017 PyObject_FREE(_PyUnicode_WSTR(unicode));
4018 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004019 Py_FatalError("Impossible unicode object state, wstr "
4020 "and str should share memory already.");
4021 return NULL;
4022#endif
4023 }
4024 else {
4025 assert(0 && "This should never happen.");
4026 }
4027 }
4028 }
4029 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004030 *size = PyUnicode_WSTR_LENGTH(unicode);
4031 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00004032}
4033
Alexander Belopolsky40018472011-02-26 01:02:56 +00004034Py_UNICODE *
4035PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004036{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004037 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004038}
4039
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004040
Alexander Belopolsky40018472011-02-26 01:02:56 +00004041Py_ssize_t
4042PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004043{
4044 if (!PyUnicode_Check(unicode)) {
4045 PyErr_BadArgument();
4046 goto onError;
4047 }
4048 return PyUnicode_GET_SIZE(unicode);
4049
Benjamin Peterson29060642009-01-31 22:14:21 +00004050 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004051 return -1;
4052}
4053
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004054Py_ssize_t
4055PyUnicode_GetLength(PyObject *unicode)
4056{
Victor Stinner07621332012-06-16 04:53:46 +02004057 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004058 PyErr_BadArgument();
4059 return -1;
4060 }
Victor Stinner07621332012-06-16 04:53:46 +02004061 if (PyUnicode_READY(unicode) == -1)
4062 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004063 return PyUnicode_GET_LENGTH(unicode);
4064}
4065
4066Py_UCS4
4067PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4068{
Victor Stinner69ed0f42013-04-09 21:48:24 +02004069 void *data;
4070 int kind;
4071
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004072 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
4073 PyErr_BadArgument();
4074 return (Py_UCS4)-1;
4075 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004076 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004077 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004078 return (Py_UCS4)-1;
4079 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02004080 data = PyUnicode_DATA(unicode);
4081 kind = PyUnicode_KIND(unicode);
4082 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004083}
4084
4085int
4086PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4087{
4088 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004089 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004090 return -1;
4091 }
Victor Stinner488fa492011-12-12 00:01:39 +01004092 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004093 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004094 PyErr_SetString(PyExc_IndexError, "string index out of range");
4095 return -1;
4096 }
Victor Stinner488fa492011-12-12 00:01:39 +01004097 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004098 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004099 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4100 PyErr_SetString(PyExc_ValueError, "character out of range");
4101 return -1;
4102 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004103 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4104 index, ch);
4105 return 0;
4106}
4107
/* Return the name of the default encoding, which is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";
    return default_encoding;
}
4113
Victor Stinner554f3f02010-06-16 23:33:54 +00004114/* create or adjust a UnicodeDecodeError */
4115static void
4116make_decode_exception(PyObject **exceptionObject,
4117 const char *encoding,
4118 const char *input, Py_ssize_t length,
4119 Py_ssize_t startpos, Py_ssize_t endpos,
4120 const char *reason)
4121{
4122 if (*exceptionObject == NULL) {
4123 *exceptionObject = PyUnicodeDecodeError_Create(
4124 encoding, input, length, startpos, endpos, reason);
4125 }
4126 else {
4127 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4128 goto onError;
4129 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4130 goto onError;
4131 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4132 goto onError;
4133 }
4134 return;
4135
4136onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004137 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004138}
4139
#ifdef HAVE_MBCS
/* error handling callback helper:
   build arguments, call the callback and check the arguments,
   if no exception occurred, copy the replacement to the output
   and adjust various state variables.

   The error handler named `errors` is looked up once and cached in
   *errorHandler; the exception is created or updated in *exceptionObject.
   The handler must return a (str, int) tuple: the replacement text and the
   input position at which decoding resumes (a negative position counts
   from the end of the input).  *input and *inend are reloaded from the
   exception object, *output is grown when needed, and *outpos, *endinpos
   and *inptr are advanced.

   return 0 on success, -1 on error
*/

static int
unicode_decode_call_errorhandler_wchar(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    PyObject **output, Py_ssize_t *outpos)
{
    static const char *argparse = "O!n;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t outsize;
    Py_ssize_t insize;
    Py_ssize_t requiredsize;
    Py_ssize_t newpos;
    PyObject *inputobj = NULL;
    wchar_t *repwstr;
    Py_ssize_t repwlen;

    assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
    outsize = _PyUnicode_WSTR_LENGTH(*output);

    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        /* &argparse[4] skips the "O!n;" prefix and leaves the message */
        PyErr_SetString(PyExc_TypeError, &argparse[4]);
        goto onError;
    }
    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    if (!PyBytes_Check(inputobj)) {
        PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
        /* Stop here: the bytes accessors below are only valid on a bytes
           object. */
        Py_DECREF(inputobj);
        goto onError;
    }
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
    if (repwstr == NULL)
        goto onError;
    /* need more space? (at least enough for what we
       have+the replacement+the rest of the string (starting
       at the new input position), so we won't have to check space
       when there are no errors in the rest of the string) */
    requiredsize = *outpos;
    if (requiredsize > PY_SSIZE_T_MAX - repwlen)
        goto overflow;
    requiredsize += repwlen;
    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
        goto overflow;
    requiredsize += insize - newpos;
    if (requiredsize > outsize) {
        /* grow geometrically when doubling is enough and cannot overflow */
        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
            requiredsize = 2*outsize;
        if (unicode_resize(output, requiredsize) < 0)
            goto onError;
    }
    /* Copy exactly repwlen characters.  memcpy() is used rather than
       wcsncpy() because the latter stops at the first null character and
       would drop anything following an embedded U+0000. */
    memcpy(_PyUnicode_WSTR(*output) + *outpos, repwstr,
           (size_t)repwlen * sizeof(wchar_t));
    *outpos += repwlen;
    *endinpos = newpos;
    *inptr = *input + newpos;

    /* we made it! */
    Py_XDECREF(restuple);
    return 0;

  overflow:
    PyErr_SetString(PyExc_OverflowError,
                    "decoded result is too long for a Python string");
    /* fall through to the common error exit */

  onError:
    Py_XDECREF(restuple);
    return -1;
}
#endif   /* HAVE_MBCS */
4255
4256static int
4257unicode_decode_call_errorhandler_writer(
4258 const char *errors, PyObject **errorHandler,
4259 const char *encoding, const char *reason,
4260 const char **input, const char **inend, Py_ssize_t *startinpos,
4261 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4262 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4263{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02004264 static const char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004265
4266 PyObject *restuple = NULL;
4267 PyObject *repunicode = NULL;
4268 Py_ssize_t insize;
4269 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004270 Py_ssize_t replen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004271 PyObject *inputobj = NULL;
4272
4273 if (*errorHandler == NULL) {
4274 *errorHandler = PyCodec_LookupError(errors);
4275 if (*errorHandler == NULL)
4276 goto onError;
4277 }
4278
4279 make_decode_exception(exceptionObject,
4280 encoding,
4281 *input, *inend - *input,
4282 *startinpos, *endinpos,
4283 reason);
4284 if (*exceptionObject == NULL)
4285 goto onError;
4286
4287 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4288 if (restuple == NULL)
4289 goto onError;
4290 if (!PyTuple_Check(restuple)) {
4291 PyErr_SetString(PyExc_TypeError, &argparse[4]);
4292 goto onError;
4293 }
4294 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004295 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004296
4297 /* Copy back the bytes variables, which might have been modified by the
4298 callback */
4299 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4300 if (!inputobj)
4301 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00004302 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004303 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00004304 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004305 *input = PyBytes_AS_STRING(inputobj);
4306 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004307 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004308 /* we can DECREF safely, as the exception has another reference,
4309 so the object won't go away. */
4310 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004311
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004312 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004313 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004314 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004315 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004316 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004317 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004318
Victor Stinner8f674cc2013-04-17 23:02:17 +02004319 if (PyUnicode_READY(repunicode) < 0)
4320 goto onError;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004321 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004322 if (replen > 1) {
4323 writer->min_length += replen - 1;
Victor Stinner8f674cc2013-04-17 23:02:17 +02004324 writer->overallocate = 1;
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004325 if (_PyUnicodeWriter_Prepare(writer, writer->min_length,
4326 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4327 goto onError;
4328 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004329 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004330 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004331
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004332 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004333 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004334
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004335 /* we made it! */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004336 Py_XDECREF(restuple);
4337 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004338
Benjamin Peterson29060642009-01-31 22:14:21 +00004339 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004340 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004341 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004342}
4343
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Three simple macros defining base-64. */
4349
/* Is c a base-64 character?  True for A-Z, a-z, 0-9, '+' and '/'.
   Note: c is evaluated several times, so pass an expression without
   side effects. */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')
4357
/* given that c is a base-64 character, what is its base-64 value?
   A-Z -> 0..25, a-z -> 26..51, 0-9 -> 52..61, '+' -> 62, anything else -> 63
   (so the result is only meaningful when IS_BASE64(c) holds).
   c is evaluated several times. */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)
4365
/* What is the base-64 character of the bottom 6 bits of n?
   Higher bits of n are masked off before the table lookup. */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
4370
/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string.  The test is (c) <= 127, so c is expected to hold a
 * non-negative byte value. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')
4378
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code (0..127) of the character.
 */

static
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
4412
/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c must be in 1..127 to be encoded directly; NUL and anything >= 128
 * never are.  directO / directWS are parenthesized in the expansion so
 * that any expression can safely be passed for them. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004424
Alexander Belopolsky40018472011-02-26 01:02:56 +00004425PyObject *
4426PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004427 Py_ssize_t size,
4428 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004429{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004430 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4431}
4432
Antoine Pitrou244651a2009-05-04 18:56:13 +00004433/* The decoder. The only state we preserve is our read position,
4434 * i.e. how many characters we have consumed. So if we end in the
4435 * middle of a shift sequence we have to back off the read position
4436 * and the output to the beginning of the sequence, otherwise we lose
4437 * all the shift state (seen bits, number of bits seen, high
4438 * surrogate). */
4439
Alexander Belopolsky40018472011-02-26 01:02:56 +00004440PyObject *
4441PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004442 Py_ssize_t size,
4443 const char *errors,
4444 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004445{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004446 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004447 Py_ssize_t startinpos;
4448 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004449 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004450 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004451 const char *errmsg = "";
4452 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004453 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004454 unsigned int base64bits = 0;
4455 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004456 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004457 PyObject *errorHandler = NULL;
4458 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004459
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004460 if (size == 0) {
4461 if (consumed)
4462 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004463 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004464 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004465
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004466 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004467 _PyUnicodeWriter_Init(&writer);
4468 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004469
4470 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004471 e = s + size;
4472
4473 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004474 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004475 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004476 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004477
Antoine Pitrou244651a2009-05-04 18:56:13 +00004478 if (inShift) { /* in a base-64 section */
4479 if (IS_BASE64(ch)) { /* consume a base-64 character */
4480 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4481 base64bits += 6;
4482 s++;
4483 if (base64bits >= 16) {
4484 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004485 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004486 base64bits -= 16;
4487 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004488 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004489 if (surrogate) {
4490 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004491 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4492 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004493 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004494 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004495 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004496 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004497 }
4498 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004499 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004500 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004501 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004502 }
4503 }
Victor Stinner551ac952011-11-29 22:58:13 +01004504 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004505 /* first surrogate */
4506 surrogate = outCh;
4507 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004508 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004509 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004510 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004511 }
4512 }
4513 }
4514 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004515 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004516 if (base64bits > 0) { /* left-over bits */
4517 if (base64bits >= 6) {
4518 /* We've seen at least one base-64 character */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004519 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004520 errmsg = "partial character in shift sequence";
4521 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004522 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004523 else {
4524 /* Some bits remain; they should be zero */
4525 if (base64buffer != 0) {
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004526 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004527 errmsg = "non-zero padding bits in shift sequence";
4528 goto utf7Error;
4529 }
4530 }
4531 }
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004532 if (surrogate && DECODE_DIRECT(ch)) {
4533 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4534 goto onError;
4535 }
4536 surrogate = 0;
4537 if (ch == '-') {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004538 /* '-' is absorbed; other terminating
4539 characters are preserved */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004540 s++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004541 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004542 }
4543 }
4544 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004545 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004546 s++; /* consume '+' */
4547 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004548 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004549 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004550 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004551 }
4552 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004553 inShift = 1;
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004554 surrogate = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004555 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004556 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004557 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004558 }
4559 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004560 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004561 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004562 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004563 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004564 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004565 else {
4566 startinpos = s-starts;
4567 s++;
4568 errmsg = "unexpected special character";
4569 goto utf7Error;
4570 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004571 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004572utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004573 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004574 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004575 errors, &errorHandler,
4576 "utf7", errmsg,
4577 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004578 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004579 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004580 }
4581
Antoine Pitrou244651a2009-05-04 18:56:13 +00004582 /* end of string */
4583
4584 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4585 /* if we're in an inconsistent state, that's an error */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004586 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004587 if (surrogate ||
4588 (base64bits >= 6) ||
4589 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004590 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004591 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004592 errors, &errorHandler,
4593 "utf7", "unterminated shift sequence",
4594 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004595 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004596 goto onError;
4597 if (s < e)
4598 goto restart;
4599 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004600 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004601
4602 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004603 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004604 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004605 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004606 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004607 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004608 writer.kind, writer.data, shiftOutStart);
4609 Py_XDECREF(errorHandler);
4610 Py_XDECREF(exc);
4611 _PyUnicodeWriter_Dealloc(&writer);
4612 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004613 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004614 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004615 }
4616 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004617 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004618 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004619 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004620
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004621 Py_XDECREF(errorHandler);
4622 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004623 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004624
Benjamin Peterson29060642009-01-31 22:14:21 +00004625 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004626 Py_XDECREF(errorHandler);
4627 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004628 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004629 return NULL;
4630}
4631
4632
Alexander Belopolsky40018472011-02-26 01:02:56 +00004633PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004634_PyUnicode_EncodeUTF7(PyObject *str,
4635 int base64SetO,
4636 int base64WhiteSpace,
4637 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004638{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004639 int kind;
4640 void *data;
4641 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004642 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004643 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004644 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004645 unsigned int base64bits = 0;
4646 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004647 char * out;
4648 char * start;
4649
Benjamin Petersonbac79492012-01-14 13:34:47 -05004650 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004651 return NULL;
4652 kind = PyUnicode_KIND(str);
4653 data = PyUnicode_DATA(str);
4654 len = PyUnicode_GET_LENGTH(str);
4655
4656 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004657 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004658
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004659 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004660 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004661 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004662 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004663 if (v == NULL)
4664 return NULL;
4665
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004666 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004667 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004668 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004669
Antoine Pitrou244651a2009-05-04 18:56:13 +00004670 if (inShift) {
4671 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4672 /* shifting out */
4673 if (base64bits) { /* output remaining bits */
4674 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4675 base64buffer = 0;
4676 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004677 }
4678 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004679 /* Characters not in the BASE64 set implicitly unshift the sequence
4680 so no '-' is required, except if the character is itself a '-' */
4681 if (IS_BASE64(ch) || ch == '-') {
4682 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004683 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004684 *out++ = (char) ch;
4685 }
4686 else {
4687 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004688 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004689 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004690 else { /* not in a shift sequence */
4691 if (ch == '+') {
4692 *out++ = '+';
4693 *out++ = '-';
4694 }
4695 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4696 *out++ = (char) ch;
4697 }
4698 else {
4699 *out++ = '+';
4700 inShift = 1;
4701 goto encode_char;
4702 }
4703 }
4704 continue;
4705encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004706 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004707 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004708
Antoine Pitrou244651a2009-05-04 18:56:13 +00004709 /* code first surrogate */
4710 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004711 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004712 while (base64bits >= 6) {
4713 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4714 base64bits -= 6;
4715 }
4716 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004717 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004718 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004719 base64bits += 16;
4720 base64buffer = (base64buffer << 16) | ch;
4721 while (base64bits >= 6) {
4722 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4723 base64bits -= 6;
4724 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004725 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004726 if (base64bits)
4727 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4728 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004729 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004730 if (_PyBytes_Resize(&v, out - start) < 0)
4731 return NULL;
4732 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004733}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004734PyObject *
4735PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4736 Py_ssize_t size,
4737 int base64SetO,
4738 int base64WhiteSpace,
4739 const char *errors)
4740{
4741 PyObject *result;
4742 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4743 if (tmp == NULL)
4744 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004745 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004746 base64WhiteSpace, errors);
4747 Py_DECREF(tmp);
4748 return result;
4749}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004750
Antoine Pitrou244651a2009-05-04 18:56:13 +00004751#undef IS_BASE64
4752#undef FROM_BASE64
4753#undef TO_BASE64
4754#undef DECODE_DIRECT
4755#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004756
/* --- UTF-8 Codec -------------------------------------------------------- */
4758
Alexander Belopolsky40018472011-02-26 01:02:56 +00004759PyObject *
4760PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004761 Py_ssize_t size,
4762 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004763{
Walter Dörwald69652032004-09-07 20:24:22 +00004764 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4765}
4766
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004767#include "stringlib/asciilib.h"
4768#include "stringlib/codecs.h"
4769#include "stringlib/undef.h"
4770
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004771#include "stringlib/ucs1lib.h"
4772#include "stringlib/codecs.h"
4773#include "stringlib/undef.h"
4774
4775#include "stringlib/ucs2lib.h"
4776#include "stringlib/codecs.h"
4777#include "stringlib/undef.h"
4778
4779#include "stringlib/ucs4lib.h"
4780#include "stringlib/codecs.h"
4781#include "stringlib/undef.h"
4782
Antoine Pitrouab868312009-01-10 15:40:25 +00004783/* Mask to quickly check whether a C 'long' contains a
4784 non-ASCII, UTF8-encoded char. */
4785#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004786# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004787#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004788# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004789#else
4790# error C 'long' size should be either 4 or 8!
4791#endif
4792
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004793static Py_ssize_t
4794ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004795{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004796 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004797 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004798
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004799 /*
4800 * Issue #17237: m68k is a bit different from most architectures in
4801 * that objects do not use "natural alignment" - for example, int and
4802 * long are only aligned at 2-byte boundaries. Therefore the assert()
4803 * won't work; also, tests have shown that skipping the "optimised
4804 * version" will even speed up m68k.
4805 */
4806#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004807#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004808 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4809 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004810 /* Fast path, see in STRINGLIB(utf8_decode) for
4811 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004812 /* Help allocation */
4813 const char *_p = p;
4814 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004815 while (_p < aligned_end) {
4816 unsigned long value = *(const unsigned long *) _p;
4817 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004818 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004819 *((unsigned long *)q) = value;
4820 _p += SIZEOF_LONG;
4821 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004822 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004823 p = _p;
4824 while (p < end) {
4825 if ((unsigned char)*p & 0x80)
4826 break;
4827 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004828 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004829 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004830 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004831#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004832#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004833 while (p < end) {
4834 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4835 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004836 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004837 /* Help allocation */
4838 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004839 while (_p < aligned_end) {
4840 unsigned long value = *(unsigned long *) _p;
4841 if (value & ASCII_CHAR_MASK)
4842 break;
4843 _p += SIZEOF_LONG;
4844 }
4845 p = _p;
4846 if (_p == end)
4847 break;
4848 }
4849 if ((unsigned char)*p & 0x80)
4850 break;
4851 ++p;
4852 }
4853 memcpy(dest, start, p - start);
4854 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004855}
Antoine Pitrouab868312009-01-10 15:40:25 +00004856
Victor Stinner785938e2011-12-11 20:09:03 +01004857PyObject *
4858PyUnicode_DecodeUTF8Stateful(const char *s,
4859 Py_ssize_t size,
4860 const char *errors,
4861 Py_ssize_t *consumed)
4862{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004863 _PyUnicodeWriter writer;
Victor Stinner785938e2011-12-11 20:09:03 +01004864 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004865 const char *end = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004866
4867 Py_ssize_t startinpos;
4868 Py_ssize_t endinpos;
4869 const char *errmsg = "";
Victor Stinner1d65d912015-10-05 13:43:50 +02004870 PyObject *error_handler_obj = NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004871 PyObject *exc = NULL;
Victor Stinner1d65d912015-10-05 13:43:50 +02004872 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner785938e2011-12-11 20:09:03 +01004873
4874 if (size == 0) {
4875 if (consumed)
4876 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004877 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004878 }
4879
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004880 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4881 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004882 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004883 *consumed = 1;
4884 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004885 }
4886
Victor Stinner8f674cc2013-04-17 23:02:17 +02004887 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02004888 writer.min_length = size;
4889 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004890 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004891
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004892 writer.pos = ascii_decode(s, end, writer.data);
4893 s += writer.pos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004894 while (s < end) {
4895 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004896 int kind = writer.kind;
Victor Stinner1d65d912015-10-05 13:43:50 +02004897
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004898 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004899 if (PyUnicode_IS_ASCII(writer.buffer))
4900 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004901 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004902 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004903 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004904 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004905 } else {
4906 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004907 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004908 }
4909
4910 switch (ch) {
4911 case 0:
4912 if (s == end || consumed)
4913 goto End;
4914 errmsg = "unexpected end of data";
4915 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004916 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004917 break;
4918 case 1:
4919 errmsg = "invalid start byte";
4920 startinpos = s - starts;
4921 endinpos = startinpos + 1;
4922 break;
4923 case 2:
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004924 case 3:
4925 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004926 errmsg = "invalid continuation byte";
4927 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004928 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004929 break;
4930 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004931 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004932 goto onError;
4933 continue;
4934 }
4935
Victor Stinner1d65d912015-10-05 13:43:50 +02004936 if (error_handler == _Py_ERROR_UNKNOWN)
4937 error_handler = get_error_handler(errors);
4938
4939 switch (error_handler) {
4940 case _Py_ERROR_IGNORE:
4941 s += (endinpos - startinpos);
4942 break;
4943
4944 case _Py_ERROR_REPLACE:
4945 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
4946 goto onError;
4947 s += (endinpos - startinpos);
4948 break;
4949
4950 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner74e8fac2015-10-05 13:49:26 +02004951 {
4952 Py_ssize_t i;
4953
Victor Stinner1d65d912015-10-05 13:43:50 +02004954 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
4955 goto onError;
Victor Stinner74e8fac2015-10-05 13:49:26 +02004956 for (i=startinpos; i<endinpos; i++) {
Victor Stinner1d65d912015-10-05 13:43:50 +02004957 ch = (Py_UCS4)(unsigned char)(starts[i]);
4958 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
4959 ch + 0xdc00);
4960 writer.pos++;
4961 }
4962 s += (endinpos - startinpos);
4963 break;
Victor Stinner74e8fac2015-10-05 13:49:26 +02004964 }
Victor Stinner1d65d912015-10-05 13:43:50 +02004965
4966 default:
4967 if (unicode_decode_call_errorhandler_writer(
4968 errors, &error_handler_obj,
4969 "utf-8", errmsg,
4970 &starts, &end, &startinpos, &endinpos, &exc, &s,
4971 &writer))
4972 goto onError;
4973 }
Victor Stinner785938e2011-12-11 20:09:03 +01004974 }
4975
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004976End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004977 if (consumed)
4978 *consumed = s - starts;
4979
Victor Stinner1d65d912015-10-05 13:43:50 +02004980 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004981 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004982 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004983
4984onError:
Victor Stinner1d65d912015-10-05 13:43:50 +02004985 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004986 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004987 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004988 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004989}
4990
#ifdef __APPLE__

/* Simplified UTF-8 decoder using the surrogateescape error handler,
   used to decode the command line arguments on Mac OS X.

   Decodes the `size` bytes at `s`.  Every byte that cannot be decoded is
   stored as the code unit 0xDC00 + byte.  The result is terminated with
   L'\0'.

   Return a pointer to a newly allocated wide character string (use
   PyMem_RawFree() to free the memory), or NULL if `size` is too large or
   the memory allocation fails. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    const char *e;
    wchar_t *unicode;
    Py_ssize_t outpos;

    /* Note: size will always be longer than the resulting Unicode
       character count.  The guard is written without computing size + 1
       so that it cannot overflow when size == PY_SSIZE_T_MAX. */
    if (size > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1)
        return NULL;
    unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
    if (!unicode)
        return NULL;

    /* Unpack UTF-8 encoded data */
    e = s + size;
    outpos = 0;
    while (s < e) {
        Py_UCS4 ch;
#if SIZEOF_WCHAR_T == 4
        ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
#else
        ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
#endif
        if (ch > 0xFF) {
#if SIZEOF_WCHAR_T == 4
            /* Not expected with 4-byte storage. */
            assert(0);
#else
            /* The decoder handed back a code point that does not fit in a
               single 16-bit unit: it is a non-BMP character (not a
               surrogate), which is split into a surrogate pair here. */
            assert(ch > 0xFFFF && ch <= 0x10FFFF);
            /* compute and append the two surrogates: */
            unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
            unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
#endif
        }
        else {
            /* Status code from the decoder: 0 with all input consumed means
               we are done; anything else marks an undecodable byte. */
            if (!ch && s == e)
                break;
            /* surrogateescape */
            unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
        }
    }
    unicode[outpos] = L'\0';
    return unicode;
}

#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00005046
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005047/* Primary internal function which creates utf8 encoded bytes objects.
5048
5049 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005050 and allocate exactly as much space needed at the end. Else allocate the
5051 maximum possible needed (4 result bytes per Unicode character), and return
5052 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005053*/
Tim Peters7e3d9612002-04-21 03:26:37 +00005054PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01005055_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005056{
Victor Stinner6099a032011-12-18 14:22:26 +01005057 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005058 void *data;
5059 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00005060
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005061 if (!PyUnicode_Check(unicode)) {
5062 PyErr_BadArgument();
5063 return NULL;
5064 }
5065
5066 if (PyUnicode_READY(unicode) == -1)
5067 return NULL;
5068
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005069 if (PyUnicode_UTF8(unicode))
5070 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5071 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005072
5073 kind = PyUnicode_KIND(unicode);
5074 data = PyUnicode_DATA(unicode);
5075 size = PyUnicode_GET_LENGTH(unicode);
5076
Benjamin Petersonead6b532011-12-20 17:23:42 -06005077 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005078 default:
5079 assert(0);
5080 case PyUnicode_1BYTE_KIND:
5081 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5082 assert(!PyUnicode_IS_ASCII(unicode));
5083 return ucs1lib_utf8_encoder(unicode, data, size, errors);
5084 case PyUnicode_2BYTE_KIND:
5085 return ucs2lib_utf8_encoder(unicode, data, size, errors);
5086 case PyUnicode_4BYTE_KIND:
5087 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00005088 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005089}
5090
Alexander Belopolsky40018472011-02-26 01:02:56 +00005091PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005092PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5093 Py_ssize_t size,
5094 const char *errors)
5095{
5096 PyObject *v, *unicode;
5097
5098 unicode = PyUnicode_FromUnicode(s, size);
5099 if (unicode == NULL)
5100 return NULL;
5101 v = _PyUnicode_AsUTF8String(unicode, errors);
5102 Py_DECREF(unicode);
5103 return v;
5104}
5105
5106PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005107PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005108{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005109 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005110}
5111
Walter Dörwald41980ca2007-08-16 21:55:45 +00005112/* --- UTF-32 Codec ------------------------------------------------------- */
5113
5114PyObject *
5115PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005116 Py_ssize_t size,
5117 const char *errors,
5118 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005119{
5120 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5121}
5122
5123PyObject *
5124PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005125 Py_ssize_t size,
5126 const char *errors,
5127 int *byteorder,
5128 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005129{
5130 const char *starts = s;
5131 Py_ssize_t startinpos;
5132 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005133 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005134 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01005135 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005136 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005137 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005138 PyObject *errorHandler = NULL;
5139 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005140
Walter Dörwald41980ca2007-08-16 21:55:45 +00005141 q = (unsigned char *)s;
5142 e = q + size;
5143
5144 if (byteorder)
5145 bo = *byteorder;
5146
5147 /* Check for BOM marks (U+FEFF) in the input and adjust current
5148 byte order setting accordingly. In native mode, the leading BOM
5149 mark is skipped, in all other modes, it is copied to the output
5150 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01005151 if (bo == 0 && size >= 4) {
5152 Py_UCS4 bom = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5153 if (bom == 0x0000FEFF) {
5154 bo = -1;
5155 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005156 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005157 else if (bom == 0xFFFE0000) {
5158 bo = 1;
5159 q += 4;
5160 }
5161 if (byteorder)
5162 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005163 }
5164
Victor Stinnere64322e2012-10-30 23:12:47 +01005165 if (q == e) {
5166 if (consumed)
5167 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005168 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00005169 }
5170
Victor Stinnere64322e2012-10-30 23:12:47 +01005171#ifdef WORDS_BIGENDIAN
5172 le = bo < 0;
5173#else
5174 le = bo <= 0;
5175#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005176 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01005177
Victor Stinner8f674cc2013-04-17 23:02:17 +02005178 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005179 writer.min_length = (e - q + 3) / 4;
5180 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005181 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005182
Victor Stinnere64322e2012-10-30 23:12:47 +01005183 while (1) {
5184 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005185 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005186
Victor Stinnere64322e2012-10-30 23:12:47 +01005187 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005188 enum PyUnicode_Kind kind = writer.kind;
5189 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005190 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005191 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005192 if (le) {
5193 do {
5194 ch = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5195 if (ch > maxch)
5196 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005197 if (kind != PyUnicode_1BYTE_KIND &&
5198 Py_UNICODE_IS_SURROGATE(ch))
5199 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005200 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005201 q += 4;
5202 } while (q <= last);
5203 }
5204 else {
5205 do {
5206 ch = (q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
5207 if (ch > maxch)
5208 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005209 if (kind != PyUnicode_1BYTE_KIND &&
5210 Py_UNICODE_IS_SURROGATE(ch))
5211 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005212 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005213 q += 4;
5214 } while (q <= last);
5215 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005216 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005217 }
5218
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005219 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005220 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005221 startinpos = ((const char *)q) - starts;
5222 endinpos = startinpos + 4;
5223 }
5224 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005225 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005226 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005227 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005228 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005229 startinpos = ((const char *)q) - starts;
5230 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005231 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005232 else {
5233 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005234 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005235 goto onError;
5236 q += 4;
5237 continue;
5238 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005239 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005240 startinpos = ((const char *)q) - starts;
5241 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005242 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005243
5244 /* The remaining input chars are ignored if the callback
5245 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005246 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005247 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005248 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005249 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005250 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005251 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005252 }
5253
Walter Dörwald41980ca2007-08-16 21:55:45 +00005254 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005255 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005256
Walter Dörwald41980ca2007-08-16 21:55:45 +00005257 Py_XDECREF(errorHandler);
5258 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005259 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005260
Benjamin Peterson29060642009-01-31 22:14:21 +00005261 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005262 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005263 Py_XDECREF(errorHandler);
5264 Py_XDECREF(exc);
5265 return NULL;
5266}
5267
5268PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005269_PyUnicode_EncodeUTF32(PyObject *str,
5270 const char *errors,
5271 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005272{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005273 enum PyUnicode_Kind kind;
5274 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005275 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005276 PyObject *v;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005277 PY_UINT32_T *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005278#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005279 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005280#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005281 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005282#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005283 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005284 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005285 PyObject *errorHandler = NULL;
5286 PyObject *exc = NULL;
5287 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005288
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005289 if (!PyUnicode_Check(str)) {
5290 PyErr_BadArgument();
5291 return NULL;
5292 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005293 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005294 return NULL;
5295 kind = PyUnicode_KIND(str);
5296 data = PyUnicode_DATA(str);
5297 len = PyUnicode_GET_LENGTH(str);
5298
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005299 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005300 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005301 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005302 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005303 if (v == NULL)
5304 return NULL;
5305
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005306 /* output buffer is 4-bytes aligned */
5307 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
5308 out = (PY_UINT32_T *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005309 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005310 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005311 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005312 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005313
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005314 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005315 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005316 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005317 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005318 else
5319 encoding = "utf-32";
5320
5321 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005322 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5323 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005324 }
5325
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005326 pos = 0;
5327 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005328 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005329
5330 if (kind == PyUnicode_2BYTE_KIND) {
5331 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5332 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005333 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005334 else {
5335 assert(kind == PyUnicode_4BYTE_KIND);
5336 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5337 &out, native_ordering);
5338 }
5339 if (pos == len)
5340 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005341
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005342 rep = unicode_encode_call_errorhandler(
5343 errors, &errorHandler,
5344 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005345 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005346 if (!rep)
5347 goto error;
5348
5349 if (PyBytes_Check(rep)) {
5350 repsize = PyBytes_GET_SIZE(rep);
5351 if (repsize & 3) {
5352 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005353 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005354 "surrogates not allowed");
5355 goto error;
5356 }
5357 moreunits = repsize / 4;
5358 }
5359 else {
5360 assert(PyUnicode_Check(rep));
5361 if (PyUnicode_READY(rep) < 0)
5362 goto error;
5363 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5364 if (!PyUnicode_IS_ASCII(rep)) {
5365 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005366 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005367 "surrogates not allowed");
5368 goto error;
5369 }
5370 }
5371
5372 /* four bytes are reserved for each surrogate */
5373 if (moreunits > 1) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005374 Py_ssize_t outpos = out - (PY_UINT32_T*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005375 Py_ssize_t morebytes = 4 * (moreunits - 1);
5376 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5377 /* integer overflow */
5378 PyErr_NoMemory();
5379 goto error;
5380 }
5381 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5382 goto error;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005383 out = (PY_UINT32_T*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005384 }
5385
5386 if (PyBytes_Check(rep)) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005387 Py_MEMCPY(out, PyBytes_AS_STRING(rep), repsize);
5388 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005389 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005390 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005391 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5392 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005393 }
5394
5395 Py_CLEAR(rep);
5396 }
5397
5398 /* Cut back to size actually needed. This is necessary for, for example,
5399 encoding of a string containing isolated surrogates and the 'ignore'
5400 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005401 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005402 if (nsize != PyBytes_GET_SIZE(v))
5403 _PyBytes_Resize(&v, nsize);
5404 Py_XDECREF(errorHandler);
5405 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005406 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005407 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005408 error:
5409 Py_XDECREF(rep);
5410 Py_XDECREF(errorHandler);
5411 Py_XDECREF(exc);
5412 Py_XDECREF(v);
5413 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005414}
5415
Alexander Belopolsky40018472011-02-26 01:02:56 +00005416PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005417PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5418 Py_ssize_t size,
5419 const char *errors,
5420 int byteorder)
5421{
5422 PyObject *result;
5423 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5424 if (tmp == NULL)
5425 return NULL;
5426 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5427 Py_DECREF(tmp);
5428 return result;
5429}
5430
5431PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005432PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005433{
Victor Stinnerb960b342011-11-20 19:12:52 +01005434 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005435}
5436
Guido van Rossumd57fd912000-03-10 22:53:23 +00005437/* --- UTF-16 Codec ------------------------------------------------------- */
5438
Tim Peters772747b2001-08-09 22:21:55 +00005439PyObject *
5440PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005441 Py_ssize_t size,
5442 const char *errors,
5443 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005444{
Walter Dörwald69652032004-09-07 20:24:22 +00005445 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5446}
5447
5448PyObject *
5449PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005450 Py_ssize_t size,
5451 const char *errors,
5452 int *byteorder,
5453 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005454{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005455 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005456 Py_ssize_t startinpos;
5457 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005458 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005459 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005460 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005461 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005462 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005463 PyObject *errorHandler = NULL;
5464 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005465 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005466
Tim Peters772747b2001-08-09 22:21:55 +00005467 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005468 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005469
5470 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005471 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005472
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005473 /* Check for BOM marks (U+FEFF) in the input and adjust current
5474 byte order setting accordingly. In native mode, the leading BOM
5475 mark is skipped, in all other modes, it is copied to the output
5476 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005477 if (bo == 0 && size >= 2) {
5478 const Py_UCS4 bom = (q[1] << 8) | q[0];
5479 if (bom == 0xFEFF) {
5480 q += 2;
5481 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005482 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005483 else if (bom == 0xFFFE) {
5484 q += 2;
5485 bo = 1;
5486 }
5487 if (byteorder)
5488 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005489 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005490
Antoine Pitrou63065d72012-05-15 23:48:04 +02005491 if (q == e) {
5492 if (consumed)
5493 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005494 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005495 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005496
Christian Heimes743e0cd2012-10-17 23:52:17 +02005497#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005498 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005499 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005500#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005501 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005502 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005503#endif
Tim Peters772747b2001-08-09 22:21:55 +00005504
Antoine Pitrou63065d72012-05-15 23:48:04 +02005505 /* Note: size will always be longer than the resulting Unicode
5506 character count */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005507 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005508 writer.min_length = (e - q + 1) / 2;
5509 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005510 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005511
Antoine Pitrou63065d72012-05-15 23:48:04 +02005512 while (1) {
5513 Py_UCS4 ch = 0;
5514 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005515 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005516 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005517 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005518 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005519 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005520 native_ordering);
5521 else
5522 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005523 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005524 native_ordering);
5525 } else if (kind == PyUnicode_2BYTE_KIND) {
5526 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005527 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005528 native_ordering);
5529 } else {
5530 assert(kind == PyUnicode_4BYTE_KIND);
5531 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005532 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005533 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005534 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005535 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005536
Antoine Pitrou63065d72012-05-15 23:48:04 +02005537 switch (ch)
5538 {
5539 case 0:
5540 /* remaining byte at the end? (size should be even) */
5541 if (q == e || consumed)
5542 goto End;
5543 errmsg = "truncated data";
5544 startinpos = ((const char *)q) - starts;
5545 endinpos = ((const char *)e) - starts;
5546 break;
5547 /* The remaining input chars are ignored if the callback
5548 chooses to skip the input */
5549 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005550 q -= 2;
5551 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005552 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005553 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005554 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005555 endinpos = ((const char *)e) - starts;
5556 break;
5557 case 2:
5558 errmsg = "illegal encoding";
5559 startinpos = ((const char *)q) - 2 - starts;
5560 endinpos = startinpos + 2;
5561 break;
5562 case 3:
5563 errmsg = "illegal UTF-16 surrogate";
5564 startinpos = ((const char *)q) - 4 - starts;
5565 endinpos = startinpos + 2;
5566 break;
5567 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005568 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005569 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005570 continue;
5571 }
5572
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005573 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005574 errors,
5575 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005576 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00005577 &starts,
5578 (const char **)&e,
5579 &startinpos,
5580 &endinpos,
5581 &exc,
5582 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005583 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005584 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005585 }
5586
Antoine Pitrou63065d72012-05-15 23:48:04 +02005587End:
Walter Dörwald69652032004-09-07 20:24:22 +00005588 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005589 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005590
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005591 Py_XDECREF(errorHandler);
5592 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005593 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005594
Benjamin Peterson29060642009-01-31 22:14:21 +00005595 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005596 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005597 Py_XDECREF(errorHandler);
5598 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005599 return NULL;
5600}
5601
Tim Peters772747b2001-08-09 22:21:55 +00005602PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005603_PyUnicode_EncodeUTF16(PyObject *str,
5604 const char *errors,
5605 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005606{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005607 enum PyUnicode_Kind kind;
5608 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005609 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005610 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005611 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005612 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005613#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005614 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005615#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005616 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005617#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005618 const char *encoding;
5619 Py_ssize_t nsize, pos;
5620 PyObject *errorHandler = NULL;
5621 PyObject *exc = NULL;
5622 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00005623
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005624 if (!PyUnicode_Check(str)) {
5625 PyErr_BadArgument();
5626 return NULL;
5627 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005628 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005629 return NULL;
5630 kind = PyUnicode_KIND(str);
5631 data = PyUnicode_DATA(str);
5632 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005633
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005634 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005635 if (kind == PyUnicode_4BYTE_KIND) {
5636 const Py_UCS4 *in = (const Py_UCS4 *)data;
5637 const Py_UCS4 *end = in + len;
5638 while (in < end)
5639 if (*in++ >= 0x10000)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005640 pairs++;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005641 }
5642 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005643 return PyErr_NoMemory();
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005644 nsize = len + pairs + (byteorder == 0);
5645 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005646 if (v == NULL)
5647 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005648
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005649 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005650 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005651 out = (unsigned short *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005652 if (byteorder == 0)
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005653 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005654 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005655 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005656
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005657 if (kind == PyUnicode_1BYTE_KIND) {
5658 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5659 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005660 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005661
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005662 if (byteorder < 0)
5663 encoding = "utf-16-le";
5664 else if (byteorder > 0)
5665 encoding = "utf-16-be";
5666 else
5667 encoding = "utf-16";
5668
5669 pos = 0;
5670 while (pos < len) {
5671 Py_ssize_t repsize, moreunits;
5672
5673 if (kind == PyUnicode_2BYTE_KIND) {
5674 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
5675 &out, native_ordering);
5676 }
5677 else {
5678 assert(kind == PyUnicode_4BYTE_KIND);
5679 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
5680 &out, native_ordering);
5681 }
5682 if (pos == len)
5683 break;
5684
5685 rep = unicode_encode_call_errorhandler(
5686 errors, &errorHandler,
5687 encoding, "surrogates not allowed",
5688 str, &exc, pos, pos + 1, &pos);
5689 if (!rep)
5690 goto error;
5691
5692 if (PyBytes_Check(rep)) {
5693 repsize = PyBytes_GET_SIZE(rep);
5694 if (repsize & 1) {
5695 raise_encode_exception(&exc, encoding,
5696 str, pos - 1, pos,
5697 "surrogates not allowed");
5698 goto error;
5699 }
5700 moreunits = repsize / 2;
5701 }
5702 else {
5703 assert(PyUnicode_Check(rep));
5704 if (PyUnicode_READY(rep) < 0)
5705 goto error;
5706 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5707 if (!PyUnicode_IS_ASCII(rep)) {
5708 raise_encode_exception(&exc, encoding,
5709 str, pos - 1, pos,
5710 "surrogates not allowed");
5711 goto error;
5712 }
5713 }
5714
5715 /* two bytes are reserved for each surrogate */
5716 if (moreunits > 1) {
5717 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
5718 Py_ssize_t morebytes = 2 * (moreunits - 1);
5719 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5720 /* integer overflow */
5721 PyErr_NoMemory();
5722 goto error;
5723 }
5724 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5725 goto error;
5726 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
5727 }
5728
5729 if (PyBytes_Check(rep)) {
5730 Py_MEMCPY(out, PyBytes_AS_STRING(rep), repsize);
5731 out += moreunits;
5732 } else /* rep is unicode */ {
5733 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5734 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5735 &out, native_ordering);
5736 }
5737
5738 Py_CLEAR(rep);
5739 }
5740
5741 /* Cut back to size actually needed. This is necessary for, for example,
5742 encoding of a string containing isolated surrogates and the 'ignore' handler
5743 is used. */
5744 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5745 if (nsize != PyBytes_GET_SIZE(v))
5746 _PyBytes_Resize(&v, nsize);
5747 Py_XDECREF(errorHandler);
5748 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00005749 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005750 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005751 error:
5752 Py_XDECREF(rep);
5753 Py_XDECREF(errorHandler);
5754 Py_XDECREF(exc);
5755 Py_XDECREF(v);
5756 return NULL;
5757#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005758}
5759
Alexander Belopolsky40018472011-02-26 01:02:56 +00005760PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005761PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5762 Py_ssize_t size,
5763 const char *errors,
5764 int byteorder)
5765{
5766 PyObject *result;
5767 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5768 if (tmp == NULL)
5769 return NULL;
5770 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5771 Py_DECREF(tmp);
5772 return result;
5773}
5774
5775PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005776PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005777{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005778 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005779}
5780
5781/* --- Unicode Escape Codec ----------------------------------------------- */
5782
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005783/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
5784 if all the escapes in the string make it still a valid ASCII string.
5785 Returns -1 if any escapes were found which cause the string to
5786 pop out of ASCII range. Otherwise returns the length of the
5787 required buffer to hold the string.
5788 */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02005789static Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005790length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
5791{
5792 const unsigned char *p = (const unsigned char *)s;
5793 const unsigned char *end = p + size;
5794 Py_ssize_t length = 0;
5795
5796 if (size < 0)
5797 return -1;
5798
5799 for (; p < end; ++p) {
5800 if (*p > 127) {
5801 /* Non-ASCII */
5802 return -1;
5803 }
5804 else if (*p != '\\') {
5805 /* Normal character */
5806 ++length;
5807 }
5808 else {
5809 /* Backslash-escape, check next char */
5810 ++p;
5811 /* Escape sequence reaches till end of string or
5812 non-ASCII follow-up. */
5813 if (p >= end || *p > 127)
5814 return -1;
5815 switch (*p) {
5816 case '\n':
5817 /* backslash + \n result in zero characters */
5818 break;
5819 case '\\': case '\'': case '\"':
5820 case 'b': case 'f': case 't':
5821 case 'n': case 'r': case 'v': case 'a':
5822 ++length;
5823 break;
5824 case '0': case '1': case '2': case '3':
5825 case '4': case '5': case '6': case '7':
5826 case 'x': case 'u': case 'U': case 'N':
5827 /* these do not guarantee ASCII characters */
5828 return -1;
5829 default:
5830 /* count the backslash + the other character */
5831 length += 2;
5832 }
5833 }
5834 }
5835 return length;
5836}
5837
Fredrik Lundh06d12682001-01-24 07:59:11 +00005838static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005839
Alexander Belopolsky40018472011-02-26 01:02:56 +00005840PyObject *
5841PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005842 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005843 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005844{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005845 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005846 Py_ssize_t startinpos;
5847 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005848 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005849 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005850 char* message;
5851 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005852 PyObject *errorHandler = NULL;
5853 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005854 Py_ssize_t len;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005855
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005856 len = length_of_escaped_ascii_string(s, size);
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005857 if (len == 0)
5858 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005859
5860 /* After length_of_escaped_ascii_string() there are two alternatives,
5861 either the string is pure ASCII with named escapes like \n, etc.
5862 and we determined it's exact size (common case)
5863 or it contains \x, \u, ... escape sequences. then we create a
5864 legacy wchar string and resize it at the end of this function. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005865 _PyUnicodeWriter_Init(&writer);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005866 if (len > 0) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02005867 writer.min_length = len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005868 }
5869 else {
5870 /* Escaped strings will always be longer than the resulting
5871 Unicode string, so we start with size here and then reduce the
5872 length after conversion to the true value.
5873 (but if the error callback returns a long replacement string
5874 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005875 writer.min_length = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005876 }
5877
Guido van Rossumd57fd912000-03-10 22:53:23 +00005878 if (size == 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005879 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005880 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005881
Guido van Rossumd57fd912000-03-10 22:53:23 +00005882 while (s < end) {
5883 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005884 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005885 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005886
5887 /* Non-escape characters are interpreted as Unicode ordinals */
5888 if (*s != '\\') {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005889 x = (unsigned char)*s;
5890 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005891 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005892 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005893 continue;
5894 }
5895
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005896 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005897 /* \ - Escapes */
5898 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005899 c = *s++;
5900 if (s > end)
5901 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005902
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005903 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005904
Benjamin Peterson29060642009-01-31 22:14:21 +00005905 /* \x escapes */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005906#define WRITECHAR(ch) \
5907 do { \
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005908 if (_PyUnicodeWriter_WriteCharInline(&writer, (ch)) < 0) \
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005909 goto onError; \
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005910 } while(0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005911
Guido van Rossumd57fd912000-03-10 22:53:23 +00005912 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005913 case '\\': WRITECHAR('\\'); break;
5914 case '\'': WRITECHAR('\''); break;
5915 case '\"': WRITECHAR('\"'); break;
5916 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005917 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005918 case 'f': WRITECHAR('\014'); break;
5919 case 't': WRITECHAR('\t'); break;
5920 case 'n': WRITECHAR('\n'); break;
5921 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005922 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005923 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005924 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005925 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005926
Benjamin Peterson29060642009-01-31 22:14:21 +00005927 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005928 case '0': case '1': case '2': case '3':
5929 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005930 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005931 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005932 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005933 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005934 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005935 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005936 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005937 break;
5938
Benjamin Peterson29060642009-01-31 22:14:21 +00005939 /* hex escapes */
5940 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005941 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005942 digits = 2;
5943 message = "truncated \\xXX escape";
5944 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005945
Benjamin Peterson29060642009-01-31 22:14:21 +00005946 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005947 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005948 digits = 4;
5949 message = "truncated \\uXXXX escape";
5950 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005951
Benjamin Peterson29060642009-01-31 22:14:21 +00005952 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005953 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005954 digits = 8;
5955 message = "truncated \\UXXXXXXXX escape";
5956 hexescape:
5957 chr = 0;
Serhiy Storchakad6793772013-01-29 10:20:44 +02005958 if (end - s < digits) {
5959 /* count only hex digits */
5960 for (; s < end; ++s) {
5961 c = (unsigned char)*s;
5962 if (!Py_ISXDIGIT(c))
5963 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005964 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005965 goto error;
5966 }
5967 for (; digits--; ++s) {
5968 c = (unsigned char)*s;
5969 if (!Py_ISXDIGIT(c))
5970 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005971 chr = (chr<<4) & ~0xF;
5972 if (c >= '0' && c <= '9')
5973 chr += c - '0';
5974 else if (c >= 'a' && c <= 'f')
5975 chr += 10 + c - 'a';
5976 else
5977 chr += 10 + c - 'A';
5978 }
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005979 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005980 /* _decoding_error will have already written into the
5981 target buffer. */
5982 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005983 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005984 /* when we get here, chr is a 32-bit unicode character */
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005985 message = "illegal Unicode character";
5986 if (chr > MAX_UNICODE)
Serhiy Storchakad6793772013-01-29 10:20:44 +02005987 goto error;
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005988 WRITECHAR(chr);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005989 break;
5990
Benjamin Peterson29060642009-01-31 22:14:21 +00005991 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005992 case 'N':
5993 message = "malformed \\N character escape";
5994 if (ucnhash_CAPI == NULL) {
5995 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005996 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5997 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005998 if (ucnhash_CAPI == NULL)
5999 goto ucnhashError;
6000 }
6001 if (*s == '{') {
6002 const char *start = s+1;
6003 /* look for the closing brace */
6004 while (*s != '}' && s < end)
6005 s++;
6006 if (s > start && s < end && *s == '}') {
6007 /* found a name. look it up in the unicode database */
6008 message = "unknown Unicode character name";
6009 s++;
Serhiy Storchaka4f5f0e52013-01-21 11:38:00 +02006010 if (s - start - 1 <= INT_MAX &&
Serhiy Storchakac35f3a92013-01-21 11:42:57 +02006011 ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03006012 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00006013 goto store;
6014 }
6015 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006016 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006017
6018 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00006019 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006020 message = "\\ at end of string";
6021 s--;
Serhiy Storchakad6793772013-01-29 10:20:44 +02006022 goto error;
Walter Dörwald8c077222002-03-25 11:16:18 +00006023 }
6024 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006025 WRITECHAR('\\');
Serhiy Storchaka73e38802013-01-25 23:52:21 +02006026 WRITECHAR((unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00006027 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006028 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006029 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006030 continue;
6031
6032 error:
6033 endinpos = s-starts;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006034 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02006035 errors, &errorHandler,
6036 "unicodeescape", message,
6037 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006038 &writer))
Serhiy Storchakad6793772013-01-29 10:20:44 +02006039 goto onError;
6040 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006041 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006042#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006043
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006044 Py_XDECREF(errorHandler);
6045 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006046 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00006047
Benjamin Peterson29060642009-01-31 22:14:21 +00006048 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00006049 PyErr_SetString(
6050 PyExc_UnicodeError,
6051 "\\N escapes not supported (can't load unicodedata module)"
6052 );
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006053 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006054 Py_XDECREF(errorHandler);
6055 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00006056 return NULL;
6057
Benjamin Peterson29060642009-01-31 22:14:21 +00006058 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006059 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006060 Py_XDECREF(errorHandler);
6061 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006062 return NULL;
6063}
6064
6065/* Return a Unicode-Escape string version of the Unicode object.
6066
6067 If quotes is true, the string is enclosed in u"" or u'' quotes as
6068 appropriate.
6069
6070*/
6071
Alexander Belopolsky40018472011-02-26 01:02:56 +00006072PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006073PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006074{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006075 Py_ssize_t i, len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006076 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006077 int kind;
6078 void *data;
Victor Stinner358af132015-10-12 22:36:57 +02006079 _PyBytesWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006080
Ezio Melottie7f90372012-10-05 03:33:31 +03006081 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00006082 escape.
6083
Ezio Melottie7f90372012-10-05 03:33:31 +03006084 For UCS1 strings it's '\xxx', 4 bytes per source character.
6085 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6086 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00006087 */
6088
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006089 if (!PyUnicode_Check(unicode)) {
6090 PyErr_BadArgument();
6091 return NULL;
6092 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006093 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006094 return NULL;
Victor Stinner358af132015-10-12 22:36:57 +02006095
6096 _PyBytesWriter_Init(&writer);
6097
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006098 len = PyUnicode_GET_LENGTH(unicode);
6099 kind = PyUnicode_KIND(unicode);
6100 data = PyUnicode_DATA(unicode);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006101
Victor Stinner358af132015-10-12 22:36:57 +02006102 p = _PyBytesWriter_Alloc(&writer, len);
6103 if (p == NULL)
6104 goto error;
6105 writer.overallocate = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006106
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006107 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006108 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006109
Walter Dörwald79e913e2007-05-12 11:08:06 +00006110 /* Escape backslashes */
6111 if (ch == '\\') {
Victor Stinner358af132015-10-12 22:36:57 +02006112 /* -1: substract 1 preallocated byte */
6113 p = _PyBytesWriter_Prepare(&writer, p, 2-1);
6114 if (p == NULL)
6115 goto error;
6116
Guido van Rossumd57fd912000-03-10 22:53:23 +00006117 *p++ = '\\';
6118 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00006119 continue;
Tim Petersced69f82003-09-16 20:30:58 +00006120 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006121
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006122 /* Map 21-bit characters to '\U00xxxxxx' */
6123 else if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006124 assert(ch <= MAX_UNICODE);
Victor Stinner358af132015-10-12 22:36:57 +02006125
6126 p = _PyBytesWriter_Prepare(&writer, p, 10-1);
6127 if (p == NULL)
6128 goto error;
6129
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006130 *p++ = '\\';
6131 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006132 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
6133 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
6134 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6135 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6136 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6137 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6138 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6139 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00006140 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006141 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00006142
Guido van Rossumd57fd912000-03-10 22:53:23 +00006143 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00006144 if (ch >= 256) {
Victor Stinner358af132015-10-12 22:36:57 +02006145 p = _PyBytesWriter_Prepare(&writer, p, 6-1);
6146 if (p == NULL)
6147 goto error;
6148
Guido van Rossumd57fd912000-03-10 22:53:23 +00006149 *p++ = '\\';
6150 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006151 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6152 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6153 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6154 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006155 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006156
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006157 /* Map special whitespace to '\t', \n', '\r' */
6158 else if (ch == '\t') {
Victor Stinner358af132015-10-12 22:36:57 +02006159 p = _PyBytesWriter_Prepare(&writer, p, 2-1);
6160 if (p == NULL)
6161 goto error;
6162
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006163 *p++ = '\\';
6164 *p++ = 't';
6165 }
6166 else if (ch == '\n') {
Victor Stinner358af132015-10-12 22:36:57 +02006167 p = _PyBytesWriter_Prepare(&writer, p, 2-1);
6168 if (p == NULL)
6169 goto error;
6170
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006171 *p++ = '\\';
6172 *p++ = 'n';
6173 }
6174 else if (ch == '\r') {
Victor Stinner358af132015-10-12 22:36:57 +02006175 p = _PyBytesWriter_Prepare(&writer, p, 2-1);
6176 if (p == NULL)
6177 goto error;
6178
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006179 *p++ = '\\';
6180 *p++ = 'r';
6181 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006182
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006183 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00006184 else if (ch < ' ' || ch >= 0x7F) {
Victor Stinner358af132015-10-12 22:36:57 +02006185 /* -1: substract 1 preallocated byte */
6186 p = _PyBytesWriter_Prepare(&writer, p, 4-1);
6187 if (p == NULL)
6188 goto error;
6189
Guido van Rossumd57fd912000-03-10 22:53:23 +00006190 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006191 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006192 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6193 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00006194 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006195
Guido van Rossumd57fd912000-03-10 22:53:23 +00006196 /* Copy everything else as-is */
6197 else
6198 *p++ = (char) ch;
6199 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006200
Victor Stinner358af132015-10-12 22:36:57 +02006201 return _PyBytesWriter_Finish(&writer, p);
6202
6203error:
6204 _PyBytesWriter_Dealloc(&writer);
6205 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006206}
6207
Alexander Belopolsky40018472011-02-26 01:02:56 +00006208PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006209PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6210 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006211{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006212 PyObject *result;
6213 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6214 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006215 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006216 result = PyUnicode_AsUnicodeEscapeString(tmp);
6217 Py_DECREF(tmp);
6218 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006219}
6220
6221/* --- Raw Unicode Escape Codec ------------------------------------------- */
6222
Alexander Belopolsky40018472011-02-26 01:02:56 +00006223PyObject *
6224PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006225 Py_ssize_t size,
6226 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006227{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006228 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006229 Py_ssize_t startinpos;
6230 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006231 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006232 const char *end;
6233 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006234 PyObject *errorHandler = NULL;
6235 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006236
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006237 if (size == 0)
6238 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006239
Guido van Rossumd57fd912000-03-10 22:53:23 +00006240 /* Escaped strings will always be longer than the resulting
6241 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006242 length after conversion to the true value. (But decoding error
6243 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006244 _PyUnicodeWriter_Init(&writer);
6245 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006246
Guido van Rossumd57fd912000-03-10 22:53:23 +00006247 end = s + size;
6248 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006249 unsigned char c;
6250 Py_UCS4 x;
6251 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006252 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006253
Benjamin Peterson29060642009-01-31 22:14:21 +00006254 /* Non-escape characters are interpreted as Unicode ordinals */
6255 if (*s != '\\') {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006256 x = (unsigned char)*s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006257 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006258 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006259 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006260 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006261 startinpos = s-starts;
6262
6263 /* \u-escapes are only interpreted iff the number of leading
6264 backslashes if odd */
6265 bs = s;
6266 for (;s < end;) {
6267 if (*s != '\\')
6268 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006269 x = (unsigned char)*s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006270 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006271 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006272 }
6273 if (((s - bs) & 1) == 0 ||
6274 s >= end ||
6275 (*s != 'u' && *s != 'U')) {
6276 continue;
6277 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006278 writer.pos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00006279 count = *s=='u' ? 4 : 8;
6280 s++;
6281
6282 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00006283 for (x = 0, i = 0; i < count; ++i, ++s) {
6284 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00006285 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006286 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006287 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00006288 errors, &errorHandler,
6289 "rawunicodeescape", "truncated \\uXXXX",
6290 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006291 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006292 goto onError;
6293 goto nextByte;
6294 }
6295 x = (x<<4) & ~0xF;
6296 if (c >= '0' && c <= '9')
6297 x += c - '0';
6298 else if (c >= 'a' && c <= 'f')
6299 x += 10 + c - 'a';
6300 else
6301 x += 10 + c - 'A';
6302 }
Victor Stinner8faf8212011-12-08 22:14:11 +01006303 if (x <= MAX_UNICODE) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006304 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006305 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006306 }
6307 else {
Christian Heimesfe337bf2008-03-23 21:54:12 +00006308 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006309 if (unicode_decode_call_errorhandler_writer(
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006310 errors, &errorHandler,
6311 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006312 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006313 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006314 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006315 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006316 nextByte:
6317 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006318 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006319 Py_XDECREF(errorHandler);
6320 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006321 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006322
Benjamin Peterson29060642009-01-31 22:14:21 +00006323 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006324 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006325 Py_XDECREF(errorHandler);
6326 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006327 return NULL;
6328}
6329
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006330
Alexander Belopolsky40018472011-02-26 01:02:56 +00006331PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006332PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006333{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006334 char *p;
Victor Stinner358af132015-10-12 22:36:57 +02006335 Py_ssize_t pos;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006336 int kind;
6337 void *data;
6338 Py_ssize_t len;
Victor Stinner358af132015-10-12 22:36:57 +02006339 _PyBytesWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006340
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006341 if (!PyUnicode_Check(unicode)) {
6342 PyErr_BadArgument();
6343 return NULL;
6344 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006345 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006346 return NULL;
Victor Stinner358af132015-10-12 22:36:57 +02006347
6348 _PyBytesWriter_Init(&writer);
6349
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006350 kind = PyUnicode_KIND(unicode);
6351 data = PyUnicode_DATA(unicode);
6352 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner0e368262011-11-10 20:12:49 +01006353
Victor Stinner358af132015-10-12 22:36:57 +02006354 p = _PyBytesWriter_Alloc(&writer, len);
6355 if (p == NULL)
6356 goto error;
6357 writer.overallocate = 1;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006358
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006359 for (pos = 0; pos < len; pos++) {
6360 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006361 /* Map 32-bit characters to '\Uxxxxxxxx' */
6362 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006363 assert(ch <= MAX_UNICODE);
Victor Stinner358af132015-10-12 22:36:57 +02006364
6365 /* -1: substract 1 preallocated byte */
6366 p = _PyBytesWriter_Prepare(&writer, p, 10-1);
6367 if (p == NULL)
6368 goto error;
6369
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006370 *p++ = '\\';
6371 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006372 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6373 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6374 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6375 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6376 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6377 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6378 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6379 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006380 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006381 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006382 else if (ch >= 256) {
Victor Stinner358af132015-10-12 22:36:57 +02006383 /* -1: substract 1 preallocated byte */
6384 p = _PyBytesWriter_Prepare(&writer, p, 6-1);
6385 if (p == NULL)
6386 goto error;
6387
Guido van Rossumd57fd912000-03-10 22:53:23 +00006388 *p++ = '\\';
6389 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006390 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6391 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6392 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6393 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006394 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006395 /* Copy everything else as-is */
6396 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006397 *p++ = (char) ch;
6398 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006399
Victor Stinner358af132015-10-12 22:36:57 +02006400 return _PyBytesWriter_Finish(&writer, p);
6401
6402error:
6403 _PyBytesWriter_Dealloc(&writer);
6404 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006405}
6406
Alexander Belopolsky40018472011-02-26 01:02:56 +00006407PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006408PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6409 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006410{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006411 PyObject *result;
6412 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6413 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006414 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006415 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6416 Py_DECREF(tmp);
6417 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006418}
6419
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006420/* --- Unicode Internal Codec ------------------------------------------- */
6421
Alexander Belopolsky40018472011-02-26 01:02:56 +00006422PyObject *
6423_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006424 Py_ssize_t size,
6425 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006426{
6427 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006428 Py_ssize_t startinpos;
6429 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006430 _PyUnicodeWriter writer;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006431 const char *end;
6432 const char *reason;
6433 PyObject *errorHandler = NULL;
6434 PyObject *exc = NULL;
6435
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006436 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006437 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006438 1))
6439 return NULL;
6440
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006441 if (size == 0)
6442 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006443
Victor Stinner8f674cc2013-04-17 23:02:17 +02006444 _PyUnicodeWriter_Init(&writer);
6445 if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) {
6446 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00006447 goto onError;
Victor Stinner8f674cc2013-04-17 23:02:17 +02006448 }
6449 writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006450
Victor Stinner8f674cc2013-04-17 23:02:17 +02006451 end = s + size;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006452 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006453 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006454 Py_UCS4 ch;
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006455 if (end - s < Py_UNICODE_SIZE) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006456 endinpos = end-starts;
6457 reason = "truncated input";
6458 goto error;
6459 }
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006460 /* We copy the raw representation one byte at a time because the
6461 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006462 ((char *) &uch)[0] = s[0];
6463 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006464#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006465 ((char *) &uch)[2] = s[2];
6466 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006467#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006468 ch = uch;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006469#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006470 /* We have to sanity check the raw data, otherwise doom looms for
6471 some malformed UCS-4 data. */
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006472 if (ch > 0x10ffff) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006473 endinpos = s - starts + Py_UNICODE_SIZE;
6474 reason = "illegal code point (> 0x10FFFF)";
6475 goto error;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006476 }
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006477#endif
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006478 s += Py_UNICODE_SIZE;
6479#ifndef Py_UNICODE_WIDE
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006480 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006481 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006482 Py_UNICODE uch2;
6483 ((char *) &uch2)[0] = s[0];
6484 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006485 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006486 {
Victor Stinner551ac952011-11-29 22:58:13 +01006487 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006488 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006489 }
6490 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006491#endif
6492
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006493 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006494 goto onError;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006495 continue;
6496
6497 error:
6498 startinpos = s - starts;
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006499 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006500 errors, &errorHandler,
6501 "unicode_internal", reason,
6502 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006503 &writer))
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006504 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006505 }
6506
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006507 Py_XDECREF(errorHandler);
6508 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006509 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006510
Benjamin Peterson29060642009-01-31 22:14:21 +00006511 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006512 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006513 Py_XDECREF(errorHandler);
6514 Py_XDECREF(exc);
6515 return NULL;
6516}
6517
Guido van Rossumd57fd912000-03-10 22:53:23 +00006518/* --- Latin-1 Codec ------------------------------------------------------ */
6519
Alexander Belopolsky40018472011-02-26 01:02:56 +00006520PyObject *
6521PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006522 Py_ssize_t size,
6523 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006524{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006525 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006526 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006527}
6528
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006529/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006530static void
6531make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006532 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006533 PyObject *unicode,
6534 Py_ssize_t startpos, Py_ssize_t endpos,
6535 const char *reason)
6536{
6537 if (*exceptionObject == NULL) {
6538 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006539 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006540 encoding, unicode, startpos, endpos, reason);
6541 }
6542 else {
6543 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6544 goto onError;
6545 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6546 goto onError;
6547 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6548 goto onError;
6549 return;
6550 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006551 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006552 }
6553}
6554
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006555/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006556static void
6557raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006558 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006559 PyObject *unicode,
6560 Py_ssize_t startpos, Py_ssize_t endpos,
6561 const char *reason)
6562{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006563 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006564 encoding, unicode, startpos, endpos, reason);
6565 if (*exceptionObject != NULL)
6566 PyCodec_StrictErrors(*exceptionObject);
6567}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006568
6569/* error handling callback helper:
6570 build arguments, call the callback and check the arguments,
6571 put the result into newpos and return the replacement string, which
6572 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006573static PyObject *
6574unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006575 PyObject **errorHandler,
6576 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006577 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006578 Py_ssize_t startpos, Py_ssize_t endpos,
6579 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006580{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02006581 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006582 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006583 PyObject *restuple;
6584 PyObject *resunicode;
6585
6586 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006587 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006588 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006589 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006590 }
6591
Benjamin Petersonbac79492012-01-14 13:34:47 -05006592 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006593 return NULL;
6594 len = PyUnicode_GET_LENGTH(unicode);
6595
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006596 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006597 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006598 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006599 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006600
6601 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006602 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006603 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006604 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006605 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006606 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006607 Py_DECREF(restuple);
6608 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006609 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006610 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006611 &resunicode, newpos)) {
6612 Py_DECREF(restuple);
6613 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006614 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006615 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6616 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6617 Py_DECREF(restuple);
6618 return NULL;
6619 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006620 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006621 *newpos = len + *newpos;
6622 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02006623 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006624 Py_DECREF(restuple);
6625 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006626 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006627 Py_INCREF(resunicode);
6628 Py_DECREF(restuple);
6629 return resunicode;
6630}
6631
Alexander Belopolsky40018472011-02-26 01:02:56 +00006632static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006633unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006634 const char *errors,
Victor Stinner0030cd52015-09-24 14:45:00 +02006635 const Py_UCS4 limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006636{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006637 /* input state */
6638 Py_ssize_t pos=0, size;
6639 int kind;
6640 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006641 /* pointer into the output */
6642 char *str;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006643 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6644 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Victor Stinner50149202015-09-22 00:26:54 +02006645 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006646 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02006647 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006648 PyObject *rep = NULL;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006649 /* output object */
6650 _PyBytesWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006651
Benjamin Petersonbac79492012-01-14 13:34:47 -05006652 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006653 return NULL;
6654 size = PyUnicode_GET_LENGTH(unicode);
6655 kind = PyUnicode_KIND(unicode);
6656 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006657 /* allocate enough for a simple encoding without
6658 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006659 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006660 return PyBytes_FromStringAndSize(NULL, 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006661
6662 _PyBytesWriter_Init(&writer);
6663 str = _PyBytesWriter_Alloc(&writer, size);
6664 if (str == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006665 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006666
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006667 while (pos < size) {
Victor Stinner0030cd52015-09-24 14:45:00 +02006668 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006669
Benjamin Peterson29060642009-01-31 22:14:21 +00006670 /* can we encode this? */
Victor Stinner0030cd52015-09-24 14:45:00 +02006671 if (ch < limit) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006672 /* no overflow check, because we know that the space is enough */
Victor Stinner0030cd52015-09-24 14:45:00 +02006673 *str++ = (char)ch;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006674 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006675 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006676 else {
Victor Stinner6bd525b2015-10-09 13:10:05 +02006677 Py_ssize_t newpos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006678 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006679 Py_ssize_t collstart = pos;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006680 Py_ssize_t collend = collstart + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006681 /* find all unecodable characters */
Victor Stinner50149202015-09-22 00:26:54 +02006682
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006683 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006684 ++collend;
Victor Stinner50149202015-09-22 00:26:54 +02006685
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006686 /* Only overallocate the buffer if it's not the last write */
6687 writer.overallocate = (collend < size);
6688
Benjamin Peterson29060642009-01-31 22:14:21 +00006689 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02006690 if (error_handler == _Py_ERROR_UNKNOWN)
6691 error_handler = get_error_handler(errors);
6692
6693 switch (error_handler) {
6694 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006695 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006696 goto onError;
Victor Stinner50149202015-09-22 00:26:54 +02006697
6698 case _Py_ERROR_REPLACE:
Victor Stinner01ada392015-10-01 21:54:51 +02006699 memset(str, '?', collend - collstart);
6700 str += (collend - collstart);
Victor Stinner0030cd52015-09-24 14:45:00 +02006701 /* fall through ignore error handler */
Victor Stinner50149202015-09-22 00:26:54 +02006702 case _Py_ERROR_IGNORE:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006703 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006704 break;
Victor Stinner50149202015-09-22 00:26:54 +02006705
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006706 case _Py_ERROR_BACKSLASHREPLACE:
Victor Stinnerad771582015-10-09 12:38:53 +02006707 /* substract preallocated bytes */
6708 writer.min_size -= (collend - collstart);
6709 str = backslashreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006710 unicode, collstart, collend);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006711 if (str == NULL)
6712 goto onError;
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006713 pos = collend;
6714 break;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006715
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006716 case _Py_ERROR_XMLCHARREFREPLACE:
Victor Stinnerad771582015-10-09 12:38:53 +02006717 /* substract preallocated bytes */
6718 writer.min_size -= (collend - collstart);
6719 str = xmlcharrefreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006720 unicode, collstart, collend);
6721 if (str == NULL)
6722 goto onError;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006723 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006724 break;
Victor Stinner50149202015-09-22 00:26:54 +02006725
Victor Stinnerc3713e92015-09-29 12:32:13 +02006726 case _Py_ERROR_SURROGATEESCAPE:
6727 for (i = collstart; i < collend; ++i) {
6728 ch = PyUnicode_READ(kind, data, i);
6729 if (ch < 0xdc80 || 0xdcff < ch) {
6730 /* Not a UTF-8b surrogate */
6731 break;
6732 }
6733 *str++ = (char)(ch - 0xdc00);
6734 ++pos;
6735 }
6736 if (i >= collend)
6737 break;
6738 collstart = pos;
6739 assert(collstart != collend);
6740 /* fallback to general error handling */
6741
Benjamin Peterson29060642009-01-31 22:14:21 +00006742 default:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006743 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
6744 encoding, reason, unicode, &exc,
6745 collstart, collend, &newpos);
6746 if (rep == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006747 goto onError;
Victor Stinner0030cd52015-09-24 14:45:00 +02006748
Victor Stinnerad771582015-10-09 12:38:53 +02006749 /* substract preallocated bytes */
6750 writer.min_size -= 1;
6751
Victor Stinner6bd525b2015-10-09 13:10:05 +02006752 if (PyBytes_Check(rep)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006753 /* Directly copy bytes result to output. */
Victor Stinnerce179bf2015-10-09 12:57:22 +02006754 str = _PyBytesWriter_WriteBytes(&writer, str,
Victor Stinner6bd525b2015-10-09 13:10:05 +02006755 PyBytes_AS_STRING(rep),
6756 PyBytes_GET_SIZE(rep));
Victor Stinnerad771582015-10-09 12:38:53 +02006757 if (str == NULL)
6758 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006759 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02006760 else {
6761 assert(PyUnicode_Check(rep));
Victor Stinner0030cd52015-09-24 14:45:00 +02006762
Victor Stinner6bd525b2015-10-09 13:10:05 +02006763 if (PyUnicode_READY(rep) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006764 goto onError;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006765
6766 if (PyUnicode_IS_ASCII(rep)) {
6767 /* Fast path: all characters are smaller than limit */
6768 assert(limit >= 128);
6769 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6770 str = _PyBytesWriter_WriteBytes(&writer, str,
6771 PyUnicode_DATA(rep),
6772 PyUnicode_GET_LENGTH(rep));
Benjamin Peterson29060642009-01-31 22:14:21 +00006773 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02006774 else {
6775 Py_ssize_t repsize = PyUnicode_GET_LENGTH(rep);
6776
6777 str = _PyBytesWriter_Prepare(&writer, str, repsize);
6778 if (str == NULL)
6779 goto onError;
6780
6781 /* check if there is anything unencodable in the
6782 replacement and copy it to the output */
6783 for (i = 0; repsize-->0; ++i, ++str) {
6784 ch = PyUnicode_READ_CHAR(rep, i);
6785 if (ch >= limit) {
6786 raise_encode_exception(&exc, encoding, unicode,
6787 pos, pos+1, reason);
6788 goto onError;
6789 }
6790 *str = (char)ch;
6791 }
6792 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006793 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006794 pos = newpos;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006795 Py_CLEAR(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006796 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006797
6798 /* If overallocation was disabled, ensure that it was the last
6799 write. Otherwise, we missed an optimization */
6800 assert(writer.overallocate || pos == size);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006801 }
6802 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006803
Victor Stinner50149202015-09-22 00:26:54 +02006804 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006805 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006806 return _PyBytesWriter_Finish(&writer, str);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006807
6808 onError:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006809 Py_XDECREF(rep);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006810 _PyBytesWriter_Dealloc(&writer);
Victor Stinner50149202015-09-22 00:26:54 +02006811 Py_XDECREF(error_handler_obj);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006812 Py_XDECREF(exc);
6813 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006814}
6815
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006816/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006817PyObject *
6818PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006819 Py_ssize_t size,
6820 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006821{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006822 PyObject *result;
6823 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6824 if (unicode == NULL)
6825 return NULL;
6826 result = unicode_encode_ucs1(unicode, errors, 256);
6827 Py_DECREF(unicode);
6828 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006829}
6830
Alexander Belopolsky40018472011-02-26 01:02:56 +00006831PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006832_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006833{
6834 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006835 PyErr_BadArgument();
6836 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006837 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006838 if (PyUnicode_READY(unicode) == -1)
6839 return NULL;
6840 /* Fast path: if it is a one-byte string, construct
6841 bytes object directly. */
6842 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6843 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6844 PyUnicode_GET_LENGTH(unicode));
6845 /* Non-Latin-1 characters present. Defer to above function to
6846 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006847 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006848}
6849
6850PyObject*
6851PyUnicode_AsLatin1String(PyObject *unicode)
6852{
6853 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006854}
6855
6856/* --- 7-bit ASCII Codec -------------------------------------------------- */
6857
Alexander Belopolsky40018472011-02-26 01:02:56 +00006858PyObject *
6859PyUnicode_DecodeASCII(const char *s,
6860 Py_ssize_t size,
6861 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006862{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006863 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006864 _PyUnicodeWriter writer;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006865 int kind;
6866 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006867 Py_ssize_t startinpos;
6868 Py_ssize_t endinpos;
6869 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006870 const char *e;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006871 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006872 PyObject *exc = NULL;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006873 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Tim Petersced69f82003-09-16 20:30:58 +00006874
Guido van Rossumd57fd912000-03-10 22:53:23 +00006875 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02006876 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006877
Guido van Rossumd57fd912000-03-10 22:53:23 +00006878 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006879 if (size == 1 && (unsigned char)s[0] < 128)
6880 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006881
Victor Stinner8f674cc2013-04-17 23:02:17 +02006882 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02006883 writer.min_length = size;
6884 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
Victor Stinner8f674cc2013-04-17 23:02:17 +02006885 return NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006886
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006887 e = s + size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006888 data = writer.data;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006889 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006890 writer.pos = outpos;
6891 if (writer.pos == size)
6892 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006893
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006894 s += writer.pos;
6895 kind = writer.kind;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006896 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02006897 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00006898 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006899 PyUnicode_WRITE(kind, data, writer.pos, c);
6900 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00006901 ++s;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006902 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +00006903 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02006904
6905 /* byte outsize range 0x00..0x7f: call the error handler */
6906
6907 if (error_handler == _Py_ERROR_UNKNOWN)
6908 error_handler = get_error_handler(errors);
6909
6910 switch (error_handler)
6911 {
6912 case _Py_ERROR_REPLACE:
6913 case _Py_ERROR_SURROGATEESCAPE:
6914 /* Fast-path: the error handler only writes one character,
Victor Stinnerca9381e2015-09-22 00:58:32 +02006915 but we may switch to UCS2 at the first write */
6916 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
6917 goto onError;
6918 kind = writer.kind;
6919 data = writer.data;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006920
6921 if (error_handler == _Py_ERROR_REPLACE)
6922 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
6923 else
6924 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
6925 writer.pos++;
6926 ++s;
6927 break;
6928
6929 case _Py_ERROR_IGNORE:
6930 ++s;
6931 break;
6932
6933 default:
Benjamin Peterson29060642009-01-31 22:14:21 +00006934 startinpos = s-starts;
6935 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006936 if (unicode_decode_call_errorhandler_writer(
Victor Stinnerf96418d2015-09-21 23:06:27 +02006937 errors, &error_handler_obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00006938 "ascii", "ordinal not in range(128)",
6939 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006940 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006941 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006942 kind = writer.kind;
6943 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00006944 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006945 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02006946 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006947 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006948 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006949
Benjamin Peterson29060642009-01-31 22:14:21 +00006950 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006951 _PyUnicodeWriter_Dealloc(&writer);
Victor Stinnerf96418d2015-09-21 23:06:27 +02006952 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006953 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006954 return NULL;
6955}
6956
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006957/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006958PyObject *
6959PyUnicode_EncodeASCII(const Py_UNICODE *p,
6960 Py_ssize_t size,
6961 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006962{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006963 PyObject *result;
6964 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6965 if (unicode == NULL)
6966 return NULL;
6967 result = unicode_encode_ucs1(unicode, errors, 128);
6968 Py_DECREF(unicode);
6969 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006970}
6971
Alexander Belopolsky40018472011-02-26 01:02:56 +00006972PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006973_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006974{
6975 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006976 PyErr_BadArgument();
6977 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006978 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006979 if (PyUnicode_READY(unicode) == -1)
6980 return NULL;
6981 /* Fast path: if it is an ASCII-only string, construct bytes object
6982 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02006983 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006984 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6985 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006986 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006987}
6988
6989PyObject *
6990PyUnicode_AsASCIIString(PyObject *unicode)
6991{
6992 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006993}
6994
Victor Stinner99b95382011-07-04 14:23:54 +02006995#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006996
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006997/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006998
/* When int is narrower than size_t, code below (for example
   decode_code_page_stateful()) tests NEED_RETRY so that input longer than
   INT_MAX is processed in several int-sized chunks. */
#if SIZEOF_SIZE_T > SIZEOF_INT
#define NEED_RETRY
#endif

/* Fallback definition used when no included header defines this flag. */
#if !defined(WC_ERR_INVALID_CHARS)
#  define WC_ERR_INVALID_CHARS 0x0080
#endif
7006
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007007static const char*
Victor Stinner3a50e702011-10-18 21:21:00 +02007008code_page_name(UINT code_page, PyObject **obj)
7009{
7010 *obj = NULL;
7011 if (code_page == CP_ACP)
7012 return "mbcs";
7013 if (code_page == CP_UTF7)
7014 return "CP_UTF7";
7015 if (code_page == CP_UTF8)
7016 return "CP_UTF8";
7017
7018 *obj = PyBytes_FromFormat("cp%u", code_page);
7019 if (*obj == NULL)
7020 return NULL;
7021 return PyBytes_AS_STRING(*obj);
7022}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007023
Victor Stinner3a50e702011-10-18 21:21:00 +02007024static DWORD
7025decode_code_page_flags(UINT code_page)
7026{
7027 if (code_page == CP_UTF7) {
7028 /* The CP_UTF7 decoder only supports flags=0 */
7029 return 0;
7030 }
7031 else
7032 return MB_ERR_INVALID_CHARS;
7033}
7034
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007035/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007036 * Decode a byte string from a Windows code page into unicode object in strict
7037 * mode.
7038 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007039 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7040 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007041 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007042static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007043decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007044 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02007045 const char *in,
7046 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007047{
Victor Stinner3a50e702011-10-18 21:21:00 +02007048 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01007049 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007050 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007051
7052 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007053 assert(insize > 0);
7054 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
7055 if (outsize <= 0)
7056 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007057
7058 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007059 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01007060 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007061 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00007062 if (*v == NULL)
7063 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007064 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007065 }
7066 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007067 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007068 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01007069 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007070 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007071 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007072 }
7073
7074 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007075 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7076 if (outsize <= 0)
7077 goto error;
7078 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007079
Victor Stinner3a50e702011-10-18 21:21:00 +02007080error:
7081 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7082 return -2;
7083 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007084 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007085}
7086
Victor Stinner3a50e702011-10-18 21:21:00 +02007087/*
7088 * Decode a byte string from a code page into unicode object with an error
7089 * handler.
7090 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007091 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02007092 * UnicodeDecodeError exception and returns -1 on error.
7093 */
7094static int
7095decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007096 PyObject **v,
7097 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007098 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02007099{
7100 const char *startin = in;
7101 const char *endin = in + size;
7102 const DWORD flags = decode_code_page_flags(code_page);
7103 /* Ideally, we should get reason from FormatMessage. This is the Windows
7104 2000 English version of the message. */
7105 const char *reason = "No mapping for the Unicode character exists "
7106 "in the target code page.";
7107 /* each step cannot decode more than 1 character, but a character can be
7108 represented as a surrogate pair */
7109 wchar_t buffer[2], *startout, *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02007110 int insize;
7111 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007112 PyObject *errorHandler = NULL;
7113 PyObject *exc = NULL;
7114 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007115 const char *encoding;
Victor Stinner3a50e702011-10-18 21:21:00 +02007116 DWORD err;
7117 int ret = -1;
7118
7119 assert(size > 0);
7120
7121 encoding = code_page_name(code_page, &encoding_obj);
7122 if (encoding == NULL)
7123 return -1;
7124
Victor Stinner7d00cc12014-03-17 23:08:06 +01007125 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007126 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7127 UnicodeDecodeError. */
7128 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7129 if (exc != NULL) {
7130 PyCodec_StrictErrors(exc);
7131 Py_CLEAR(exc);
7132 }
7133 goto error;
7134 }
7135
7136 if (*v == NULL) {
7137 /* Create unicode object */
7138 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7139 PyErr_NoMemory();
7140 goto error;
7141 }
Victor Stinnerab595942011-12-17 04:59:06 +01007142 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007143 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02007144 if (*v == NULL)
7145 goto error;
7146 startout = PyUnicode_AS_UNICODE(*v);
7147 }
7148 else {
7149 /* Extend unicode object */
7150 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7151 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7152 PyErr_NoMemory();
7153 goto error;
7154 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007155 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007156 goto error;
7157 startout = PyUnicode_AS_UNICODE(*v) + n;
7158 }
7159
7160 /* Decode the byte string character per character */
7161 out = startout;
7162 while (in < endin)
7163 {
7164 /* Decode a character */
7165 insize = 1;
7166 do
7167 {
7168 outsize = MultiByteToWideChar(code_page, flags,
7169 in, insize,
7170 buffer, Py_ARRAY_LENGTH(buffer));
7171 if (outsize > 0)
7172 break;
7173 err = GetLastError();
7174 if (err != ERROR_NO_UNICODE_TRANSLATION
7175 && err != ERROR_INSUFFICIENT_BUFFER)
7176 {
7177 PyErr_SetFromWindowsErr(0);
7178 goto error;
7179 }
7180 insize++;
7181 }
7182 /* 4=maximum length of a UTF-8 sequence */
7183 while (insize <= 4 && (in + insize) <= endin);
7184
7185 if (outsize <= 0) {
7186 Py_ssize_t startinpos, endinpos, outpos;
7187
Victor Stinner7d00cc12014-03-17 23:08:06 +01007188 /* last character in partial decode? */
7189 if (in + insize >= endin && !final)
7190 break;
7191
Victor Stinner3a50e702011-10-18 21:21:00 +02007192 startinpos = in - startin;
7193 endinpos = startinpos + 1;
7194 outpos = out - PyUnicode_AS_UNICODE(*v);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007195 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007196 errors, &errorHandler,
7197 encoding, reason,
7198 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01007199 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007200 {
7201 goto error;
7202 }
Victor Stinner596a6c42011-11-09 00:02:18 +01007203 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007204 }
7205 else {
7206 in += insize;
7207 memcpy(out, buffer, outsize * sizeof(wchar_t));
7208 out += outsize;
7209 }
7210 }
7211
7212 /* write a NUL character at the end */
7213 *out = 0;
7214
7215 /* Extend unicode object */
7216 outsize = out - startout;
7217 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01007218 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007219 goto error;
Victor Stinnere1f17c62014-07-25 14:03:03 +02007220 /* (in - startin) <= size and size is an int */
7221 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02007222
7223error:
7224 Py_XDECREF(encoding_obj);
7225 Py_XDECREF(errorHandler);
7226 Py_XDECREF(exc);
7227 return ret;
7228}
7229
Victor Stinner3a50e702011-10-18 21:21:00 +02007230static PyObject *
7231decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007232 const char *s, Py_ssize_t size,
7233 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007234{
Victor Stinner76a31a62011-11-04 00:05:13 +01007235 PyObject *v = NULL;
7236 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007237
Victor Stinner3a50e702011-10-18 21:21:00 +02007238 if (code_page < 0) {
7239 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7240 return NULL;
7241 }
7242
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007243 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007244 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007245
Victor Stinner76a31a62011-11-04 00:05:13 +01007246 do
7247 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007248#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007249 if (size > INT_MAX) {
7250 chunk_size = INT_MAX;
7251 final = 0;
7252 done = 0;
7253 }
7254 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007255#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007256 {
7257 chunk_size = (int)size;
7258 final = (consumed == NULL);
7259 done = 1;
7260 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007261
Victor Stinner76a31a62011-11-04 00:05:13 +01007262 if (chunk_size == 0 && done) {
7263 if (v != NULL)
7264 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007265 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007266 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007267
Victor Stinner76a31a62011-11-04 00:05:13 +01007268 converted = decode_code_page_strict(code_page, &v,
7269 s, chunk_size);
7270 if (converted == -2)
7271 converted = decode_code_page_errors(code_page, &v,
7272 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007273 errors, final);
7274 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007275
7276 if (converted < 0) {
7277 Py_XDECREF(v);
7278 return NULL;
7279 }
7280
7281 if (consumed)
7282 *consumed += converted;
7283
7284 s += converted;
7285 size -= converted;
7286 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007287
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007288 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007289}
7290
Alexander Belopolsky40018472011-02-26 01:02:56 +00007291PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007292PyUnicode_DecodeCodePageStateful(int code_page,
7293 const char *s,
7294 Py_ssize_t size,
7295 const char *errors,
7296 Py_ssize_t *consumed)
7297{
7298 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7299}
7300
7301PyObject *
7302PyUnicode_DecodeMBCSStateful(const char *s,
7303 Py_ssize_t size,
7304 const char *errors,
7305 Py_ssize_t *consumed)
7306{
7307 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7308}
7309
7310PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007311PyUnicode_DecodeMBCS(const char *s,
7312 Py_ssize_t size,
7313 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007314{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007315 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7316}
7317
Victor Stinner3a50e702011-10-18 21:21:00 +02007318static DWORD
7319encode_code_page_flags(UINT code_page, const char *errors)
7320{
7321 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007322 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007323 }
7324 else if (code_page == CP_UTF7) {
7325 /* CP_UTF7 only supports flags=0 */
7326 return 0;
7327 }
7328 else {
7329 if (errors != NULL && strcmp(errors, "replace") == 0)
7330 return 0;
7331 else
7332 return WC_NO_BEST_FIT_CHARS;
7333 }
7334}
7335
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007336/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007337 * Encode a Unicode string to a Windows code page into a byte string in strict
7338 * mode.
7339 *
7340 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007341 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007342 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007343static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007344encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007345 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007346 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007347{
Victor Stinner554f3f02010-06-16 23:33:54 +00007348 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007349 BOOL *pusedDefaultChar = &usedDefaultChar;
7350 int outsize;
Victor Stinner24729f32011-11-10 20:31:37 +01007351 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007352 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007353 const DWORD flags = encode_code_page_flags(code_page, NULL);
7354 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007355 /* Create a substring so that we can get the UTF-16 representation
7356 of just the slice under consideration. */
7357 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007358
Martin v. Löwis3d325192011-11-04 18:23:06 +01007359 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007360
Victor Stinner3a50e702011-10-18 21:21:00 +02007361 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007362 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007363 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007364 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007365
Victor Stinner2fc507f2011-11-04 20:06:39 +01007366 substring = PyUnicode_Substring(unicode, offset, offset+len);
7367 if (substring == NULL)
7368 return -1;
7369 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7370 if (p == NULL) {
7371 Py_DECREF(substring);
7372 return -1;
7373 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007374 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007375
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007376 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007377 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007378 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007379 NULL, 0,
7380 NULL, pusedDefaultChar);
7381 if (outsize <= 0)
7382 goto error;
7383 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007384 if (pusedDefaultChar && *pusedDefaultChar) {
7385 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007386 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007387 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007388
Victor Stinner3a50e702011-10-18 21:21:00 +02007389 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007390 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007391 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007392 if (*outbytes == NULL) {
7393 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007394 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007395 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007396 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007397 }
7398 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007399 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007400 const Py_ssize_t n = PyBytes_Size(*outbytes);
7401 if (outsize > PY_SSIZE_T_MAX - n) {
7402 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007403 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007404 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007405 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007406 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7407 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007408 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007409 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007410 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007411 }
7412
7413 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007414 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007415 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007416 out, outsize,
7417 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007418 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007419 if (outsize <= 0)
7420 goto error;
7421 if (pusedDefaultChar && *pusedDefaultChar)
7422 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007423 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007424
Victor Stinner3a50e702011-10-18 21:21:00 +02007425error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007426 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007427 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7428 return -2;
7429 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007430 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007431}
7432
Victor Stinner3a50e702011-10-18 21:21:00 +02007433/*
Serhiy Storchakad65c9492015-11-02 14:10:23 +02007434 * Encode a Unicode string to a Windows code page into a byte string using an
Victor Stinner3a50e702011-10-18 21:21:00 +02007435 * error handler.
7436 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007437 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007438 * -1 on other error.
7439 */
7440static int
7441encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007442 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007443 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007444{
Victor Stinner3a50e702011-10-18 21:21:00 +02007445 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007446 Py_ssize_t pos = unicode_offset;
7447 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007448 /* Ideally, we should get reason from FormatMessage. This is the Windows
7449 2000 English version of the message. */
7450 const char *reason = "invalid character";
7451 /* 4=maximum length of a UTF-8 sequence */
7452 char buffer[4];
7453 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7454 Py_ssize_t outsize;
7455 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007456 PyObject *errorHandler = NULL;
7457 PyObject *exc = NULL;
7458 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007459 const char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007460 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007461 PyObject *rep;
7462 int ret = -1;
7463
7464 assert(insize > 0);
7465
7466 encoding = code_page_name(code_page, &encoding_obj);
7467 if (encoding == NULL)
7468 return -1;
7469
7470 if (errors == NULL || strcmp(errors, "strict") == 0) {
7471 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7472 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007473 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007474 if (exc != NULL) {
7475 PyCodec_StrictErrors(exc);
7476 Py_DECREF(exc);
7477 }
7478 Py_XDECREF(encoding_obj);
7479 return -1;
7480 }
7481
7482 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7483 pusedDefaultChar = &usedDefaultChar;
7484 else
7485 pusedDefaultChar = NULL;
7486
7487 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7488 PyErr_NoMemory();
7489 goto error;
7490 }
7491 outsize = insize * Py_ARRAY_LENGTH(buffer);
7492
7493 if (*outbytes == NULL) {
7494 /* Create string object */
7495 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7496 if (*outbytes == NULL)
7497 goto error;
7498 out = PyBytes_AS_STRING(*outbytes);
7499 }
7500 else {
7501 /* Extend string object */
7502 Py_ssize_t n = PyBytes_Size(*outbytes);
7503 if (n > PY_SSIZE_T_MAX - outsize) {
7504 PyErr_NoMemory();
7505 goto error;
7506 }
7507 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7508 goto error;
7509 out = PyBytes_AS_STRING(*outbytes) + n;
7510 }
7511
7512 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007513 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007514 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007515 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7516 wchar_t chars[2];
7517 int charsize;
7518 if (ch < 0x10000) {
7519 chars[0] = (wchar_t)ch;
7520 charsize = 1;
7521 }
7522 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007523 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7524 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007525 charsize = 2;
7526 }
7527
Victor Stinner3a50e702011-10-18 21:21:00 +02007528 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007529 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007530 buffer, Py_ARRAY_LENGTH(buffer),
7531 NULL, pusedDefaultChar);
7532 if (outsize > 0) {
7533 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7534 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007535 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007536 memcpy(out, buffer, outsize);
7537 out += outsize;
7538 continue;
7539 }
7540 }
7541 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7542 PyErr_SetFromWindowsErr(0);
7543 goto error;
7544 }
7545
Victor Stinner3a50e702011-10-18 21:21:00 +02007546 rep = unicode_encode_call_errorhandler(
7547 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007548 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007549 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007550 if (rep == NULL)
7551 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007552 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007553
7554 if (PyBytes_Check(rep)) {
7555 outsize = PyBytes_GET_SIZE(rep);
7556 if (outsize != 1) {
7557 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7558 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7559 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7560 Py_DECREF(rep);
7561 goto error;
7562 }
7563 out = PyBytes_AS_STRING(*outbytes) + offset;
7564 }
7565 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7566 out += outsize;
7567 }
7568 else {
7569 Py_ssize_t i;
7570 enum PyUnicode_Kind kind;
7571 void *data;
7572
Benjamin Petersonbac79492012-01-14 13:34:47 -05007573 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007574 Py_DECREF(rep);
7575 goto error;
7576 }
7577
7578 outsize = PyUnicode_GET_LENGTH(rep);
7579 if (outsize != 1) {
7580 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7581 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7582 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7583 Py_DECREF(rep);
7584 goto error;
7585 }
7586 out = PyBytes_AS_STRING(*outbytes) + offset;
7587 }
7588 kind = PyUnicode_KIND(rep);
7589 data = PyUnicode_DATA(rep);
7590 for (i=0; i < outsize; i++) {
7591 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7592 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007593 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007594 encoding, unicode,
7595 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007596 "unable to encode error handler result to ASCII");
7597 Py_DECREF(rep);
7598 goto error;
7599 }
7600 *out = (unsigned char)ch;
7601 out++;
7602 }
7603 }
7604 Py_DECREF(rep);
7605 }
7606 /* write a NUL byte */
7607 *out = 0;
7608 outsize = out - PyBytes_AS_STRING(*outbytes);
7609 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7610 if (_PyBytes_Resize(outbytes, outsize) < 0)
7611 goto error;
7612 ret = 0;
7613
7614error:
7615 Py_XDECREF(encoding_obj);
7616 Py_XDECREF(errorHandler);
7617 Py_XDECREF(exc);
7618 return ret;
7619}
7620
Victor Stinner3a50e702011-10-18 21:21:00 +02007621static PyObject *
7622encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007623 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007624 const char *errors)
7625{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007626 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007627 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007628 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007629 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007630
Victor Stinner29dacf22015-01-26 16:41:32 +01007631 if (!PyUnicode_Check(unicode)) {
7632 PyErr_BadArgument();
7633 return NULL;
7634 }
7635
Benjamin Petersonbac79492012-01-14 13:34:47 -05007636 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007637 return NULL;
7638 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007639
Victor Stinner3a50e702011-10-18 21:21:00 +02007640 if (code_page < 0) {
7641 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7642 return NULL;
7643 }
7644
Martin v. Löwis3d325192011-11-04 18:23:06 +01007645 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007646 return PyBytes_FromStringAndSize(NULL, 0);
7647
Victor Stinner7581cef2011-11-03 22:32:33 +01007648 offset = 0;
7649 do
7650 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007651#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007652 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007653 chunks. */
7654 if (len > INT_MAX/2) {
7655 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007656 done = 0;
7657 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007658 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007659#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007660 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007661 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007662 done = 1;
7663 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007664
Victor Stinner76a31a62011-11-04 00:05:13 +01007665 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007666 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007667 errors);
7668 if (ret == -2)
7669 ret = encode_code_page_errors(code_page, &outbytes,
7670 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007671 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007672 if (ret < 0) {
7673 Py_XDECREF(outbytes);
7674 return NULL;
7675 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007676
Victor Stinner7581cef2011-11-03 22:32:33 +01007677 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007678 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007679 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007680
Victor Stinner3a50e702011-10-18 21:21:00 +02007681 return outbytes;
7682}
7683
7684PyObject *
7685PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7686 Py_ssize_t size,
7687 const char *errors)
7688{
Victor Stinner7581cef2011-11-03 22:32:33 +01007689 PyObject *unicode, *res;
7690 unicode = PyUnicode_FromUnicode(p, size);
7691 if (unicode == NULL)
7692 return NULL;
7693 res = encode_code_page(CP_ACP, unicode, errors);
7694 Py_DECREF(unicode);
7695 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007696}
7697
7698PyObject *
7699PyUnicode_EncodeCodePage(int code_page,
7700 PyObject *unicode,
7701 const char *errors)
7702{
Victor Stinner7581cef2011-11-03 22:32:33 +01007703 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007704}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007705
Alexander Belopolsky40018472011-02-26 01:02:56 +00007706PyObject *
7707PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007708{
Victor Stinner7581cef2011-11-03 22:32:33 +01007709 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007710}
7711
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007712#undef NEED_RETRY
7713
Victor Stinner99b95382011-07-04 14:23:54 +02007714#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007715
Guido van Rossumd57fd912000-03-10 22:53:23 +00007716/* --- Character Mapping Codec -------------------------------------------- */
7717
Victor Stinnerfb161b12013-04-18 01:44:27 +02007718static int
7719charmap_decode_string(const char *s,
7720 Py_ssize_t size,
7721 PyObject *mapping,
7722 const char *errors,
7723 _PyUnicodeWriter *writer)
7724{
7725 const char *starts = s;
7726 const char *e;
7727 Py_ssize_t startinpos, endinpos;
7728 PyObject *errorHandler = NULL, *exc = NULL;
7729 Py_ssize_t maplen;
7730 enum PyUnicode_Kind mapkind;
7731 void *mapdata;
7732 Py_UCS4 x;
7733 unsigned char ch;
7734
7735 if (PyUnicode_READY(mapping) == -1)
7736 return -1;
7737
7738 maplen = PyUnicode_GET_LENGTH(mapping);
7739 mapdata = PyUnicode_DATA(mapping);
7740 mapkind = PyUnicode_KIND(mapping);
7741
7742 e = s + size;
7743
7744 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7745 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7746 * is disabled in encoding aliases, latin1 is preferred because
7747 * its implementation is faster. */
7748 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7749 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7750 Py_UCS4 maxchar = writer->maxchar;
7751
7752 assert (writer->kind == PyUnicode_1BYTE_KIND);
7753 while (s < e) {
7754 ch = *s;
7755 x = mapdata_ucs1[ch];
7756 if (x > maxchar) {
7757 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7758 goto onError;
7759 maxchar = writer->maxchar;
7760 outdata = (Py_UCS1 *)writer->data;
7761 }
7762 outdata[writer->pos] = x;
7763 writer->pos++;
7764 ++s;
7765 }
7766 return 0;
7767 }
7768
7769 while (s < e) {
7770 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7771 enum PyUnicode_Kind outkind = writer->kind;
7772 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7773 if (outkind == PyUnicode_1BYTE_KIND) {
7774 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7775 Py_UCS4 maxchar = writer->maxchar;
7776 while (s < e) {
7777 ch = *s;
7778 x = mapdata_ucs2[ch];
7779 if (x > maxchar)
7780 goto Error;
7781 outdata[writer->pos] = x;
7782 writer->pos++;
7783 ++s;
7784 }
7785 break;
7786 }
7787 else if (outkind == PyUnicode_2BYTE_KIND) {
7788 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7789 while (s < e) {
7790 ch = *s;
7791 x = mapdata_ucs2[ch];
7792 if (x == 0xFFFE)
7793 goto Error;
7794 outdata[writer->pos] = x;
7795 writer->pos++;
7796 ++s;
7797 }
7798 break;
7799 }
7800 }
7801 ch = *s;
7802
7803 if (ch < maplen)
7804 x = PyUnicode_READ(mapkind, mapdata, ch);
7805 else
7806 x = 0xfffe; /* invalid value */
7807Error:
7808 if (x == 0xfffe)
7809 {
7810 /* undefined mapping */
7811 startinpos = s-starts;
7812 endinpos = startinpos+1;
7813 if (unicode_decode_call_errorhandler_writer(
7814 errors, &errorHandler,
7815 "charmap", "character maps to <undefined>",
7816 &starts, &e, &startinpos, &endinpos, &exc, &s,
7817 writer)) {
7818 goto onError;
7819 }
7820 continue;
7821 }
7822
7823 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
7824 goto onError;
7825 ++s;
7826 }
7827 Py_XDECREF(errorHandler);
7828 Py_XDECREF(exc);
7829 return 0;
7830
7831onError:
7832 Py_XDECREF(errorHandler);
7833 Py_XDECREF(exc);
7834 return -1;
7835}
7836
7837static int
7838charmap_decode_mapping(const char *s,
7839 Py_ssize_t size,
7840 PyObject *mapping,
7841 const char *errors,
7842 _PyUnicodeWriter *writer)
7843{
7844 const char *starts = s;
7845 const char *e;
7846 Py_ssize_t startinpos, endinpos;
7847 PyObject *errorHandler = NULL, *exc = NULL;
7848 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02007849 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02007850
7851 e = s + size;
7852
7853 while (s < e) {
7854 ch = *s;
7855
7856 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7857 key = PyLong_FromLong((long)ch);
7858 if (key == NULL)
7859 goto onError;
7860
7861 item = PyObject_GetItem(mapping, key);
7862 Py_DECREF(key);
7863 if (item == NULL) {
7864 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7865 /* No mapping found means: mapping is undefined. */
7866 PyErr_Clear();
7867 goto Undefined;
7868 } else
7869 goto onError;
7870 }
7871
7872 /* Apply mapping */
7873 if (item == Py_None)
7874 goto Undefined;
7875 if (PyLong_Check(item)) {
7876 long value = PyLong_AS_LONG(item);
7877 if (value == 0xFFFE)
7878 goto Undefined;
7879 if (value < 0 || value > MAX_UNICODE) {
7880 PyErr_Format(PyExc_TypeError,
7881 "character mapping must be in range(0x%lx)",
7882 (unsigned long)MAX_UNICODE + 1);
7883 goto onError;
7884 }
7885
7886 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7887 goto onError;
7888 }
7889 else if (PyUnicode_Check(item)) {
7890 if (PyUnicode_READY(item) == -1)
7891 goto onError;
7892 if (PyUnicode_GET_LENGTH(item) == 1) {
7893 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
7894 if (value == 0xFFFE)
7895 goto Undefined;
7896 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7897 goto onError;
7898 }
7899 else {
7900 writer->overallocate = 1;
7901 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
7902 goto onError;
7903 }
7904 }
7905 else {
7906 /* wrong return value */
7907 PyErr_SetString(PyExc_TypeError,
7908 "character mapping must return integer, None or str");
7909 goto onError;
7910 }
7911 Py_CLEAR(item);
7912 ++s;
7913 continue;
7914
7915Undefined:
7916 /* undefined mapping */
7917 Py_CLEAR(item);
7918 startinpos = s-starts;
7919 endinpos = startinpos+1;
7920 if (unicode_decode_call_errorhandler_writer(
7921 errors, &errorHandler,
7922 "charmap", "character maps to <undefined>",
7923 &starts, &e, &startinpos, &endinpos, &exc, &s,
7924 writer)) {
7925 goto onError;
7926 }
7927 }
7928 Py_XDECREF(errorHandler);
7929 Py_XDECREF(exc);
7930 return 0;
7931
7932onError:
7933 Py_XDECREF(item);
7934 Py_XDECREF(errorHandler);
7935 Py_XDECREF(exc);
7936 return -1;
7937}
7938
Alexander Belopolsky40018472011-02-26 01:02:56 +00007939PyObject *
7940PyUnicode_DecodeCharmap(const char *s,
7941 Py_ssize_t size,
7942 PyObject *mapping,
7943 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007944{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007945 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00007946
Guido van Rossumd57fd912000-03-10 22:53:23 +00007947 /* Default to Latin-1 */
7948 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007949 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007950
Guido van Rossumd57fd912000-03-10 22:53:23 +00007951 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02007952 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02007953 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02007954 writer.min_length = size;
7955 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007956 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007957
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007958 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02007959 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
7960 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007961 }
7962 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02007963 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
7964 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007965 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007966 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007967
Benjamin Peterson29060642009-01-31 22:14:21 +00007968 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007969 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007970 return NULL;
7971}
7972
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007973/* Charmap encoding: the lookup table */
7974
Alexander Belopolsky40018472011-02-26 01:02:56 +00007975struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007976 PyObject_HEAD
7977 unsigned char level1[32];
7978 int count2, count3;
7979 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007980};
7981
7982static PyObject*
7983encoding_map_size(PyObject *obj, PyObject* args)
7984{
7985 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007986 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007987 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007988}
7989
7990static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007991 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007992 PyDoc_STR("Return the size (in bytes) of this object") },
7993 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007994};
7995
7996static void
7997encoding_map_dealloc(PyObject* o)
7998{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007999 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008000}
8001
8002static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008003 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008004 "EncodingMap", /*tp_name*/
8005 sizeof(struct encoding_map), /*tp_basicsize*/
8006 0, /*tp_itemsize*/
8007 /* methods */
8008 encoding_map_dealloc, /*tp_dealloc*/
8009 0, /*tp_print*/
8010 0, /*tp_getattr*/
8011 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00008012 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008013 0, /*tp_repr*/
8014 0, /*tp_as_number*/
8015 0, /*tp_as_sequence*/
8016 0, /*tp_as_mapping*/
8017 0, /*tp_hash*/
8018 0, /*tp_call*/
8019 0, /*tp_str*/
8020 0, /*tp_getattro*/
8021 0, /*tp_setattro*/
8022 0, /*tp_as_buffer*/
8023 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8024 0, /*tp_doc*/
8025 0, /*tp_traverse*/
8026 0, /*tp_clear*/
8027 0, /*tp_richcompare*/
8028 0, /*tp_weaklistoffset*/
8029 0, /*tp_iter*/
8030 0, /*tp_iternext*/
8031 encoding_map_methods, /*tp_methods*/
8032 0, /*tp_members*/
8033 0, /*tp_getset*/
8034 0, /*tp_base*/
8035 0, /*tp_dict*/
8036 0, /*tp_descr_get*/
8037 0, /*tp_descr_set*/
8038 0, /*tp_dictoffset*/
8039 0, /*tp_init*/
8040 0, /*tp_alloc*/
8041 0, /*tp_new*/
8042 0, /*tp_free*/
8043 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008044};
8045
8046PyObject*
8047PyUnicode_BuildEncodingMap(PyObject* string)
8048{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008049 PyObject *result;
8050 struct encoding_map *mresult;
8051 int i;
8052 int need_dict = 0;
8053 unsigned char level1[32];
8054 unsigned char level2[512];
8055 unsigned char *mlevel1, *mlevel2, *mlevel3;
8056 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008057 int kind;
8058 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008059 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008060 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008061
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008062 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008063 PyErr_BadArgument();
8064 return NULL;
8065 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008066 kind = PyUnicode_KIND(string);
8067 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008068 length = PyUnicode_GET_LENGTH(string);
8069 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008070 memset(level1, 0xFF, sizeof level1);
8071 memset(level2, 0xFF, sizeof level2);
8072
8073 /* If there isn't a one-to-one mapping of NULL to \0,
8074 or if there are non-BMP characters, we need to use
8075 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008076 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008077 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008078 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008079 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008080 ch = PyUnicode_READ(kind, data, i);
8081 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008082 need_dict = 1;
8083 break;
8084 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008085 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008086 /* unmapped character */
8087 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008088 l1 = ch >> 11;
8089 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008090 if (level1[l1] == 0xFF)
8091 level1[l1] = count2++;
8092 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008093 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008094 }
8095
8096 if (count2 >= 0xFF || count3 >= 0xFF)
8097 need_dict = 1;
8098
8099 if (need_dict) {
8100 PyObject *result = PyDict_New();
8101 PyObject *key, *value;
8102 if (!result)
8103 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008104 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008105 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008106 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008107 if (!key || !value)
8108 goto failed1;
8109 if (PyDict_SetItem(result, key, value) == -1)
8110 goto failed1;
8111 Py_DECREF(key);
8112 Py_DECREF(value);
8113 }
8114 return result;
8115 failed1:
8116 Py_XDECREF(key);
8117 Py_XDECREF(value);
8118 Py_DECREF(result);
8119 return NULL;
8120 }
8121
8122 /* Create a three-level trie */
8123 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8124 16*count2 + 128*count3 - 1);
8125 if (!result)
8126 return PyErr_NoMemory();
8127 PyObject_Init(result, &EncodingMapType);
8128 mresult = (struct encoding_map*)result;
8129 mresult->count2 = count2;
8130 mresult->count3 = count3;
8131 mlevel1 = mresult->level1;
8132 mlevel2 = mresult->level23;
8133 mlevel3 = mresult->level23 + 16*count2;
8134 memcpy(mlevel1, level1, 32);
8135 memset(mlevel2, 0xFF, 16*count2);
8136 memset(mlevel3, 0, 128*count3);
8137 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008138 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008139 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008140 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8141 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008142 /* unmapped character */
8143 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008144 o1 = ch>>11;
8145 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008146 i2 = 16*mlevel1[o1] + o2;
8147 if (mlevel2[i2] == 0xFF)
8148 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008149 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008150 i3 = 128*mlevel2[i2] + o3;
8151 mlevel3[i3] = i;
8152 }
8153 return result;
8154}
8155
8156static int
Victor Stinner22168992011-11-20 17:09:18 +01008157encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008158{
8159 struct encoding_map *map = (struct encoding_map*)mapping;
8160 int l1 = c>>11;
8161 int l2 = (c>>7) & 0xF;
8162 int l3 = c & 0x7F;
8163 int i;
8164
Victor Stinner22168992011-11-20 17:09:18 +01008165 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008166 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008167 if (c == 0)
8168 return 0;
8169 /* level 1*/
8170 i = map->level1[l1];
8171 if (i == 0xFF) {
8172 return -1;
8173 }
8174 /* level 2*/
8175 i = map->level23[16*i+l2];
8176 if (i == 0xFF) {
8177 return -1;
8178 }
8179 /* level 3 */
8180 i = map->level23[16*map->count2 + 128*i + l3];
8181 if (i == 0) {
8182 return -1;
8183 }
8184 return i;
8185}
8186
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008187/* Lookup the character ch in the mapping. If the character
8188 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008189 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008190static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008191charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008192{
Christian Heimes217cfd12007-12-02 14:31:20 +00008193 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008194 PyObject *x;
8195
8196 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008197 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008198 x = PyObject_GetItem(mapping, w);
8199 Py_DECREF(w);
8200 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008201 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8202 /* No mapping found means: mapping is undefined. */
8203 PyErr_Clear();
8204 x = Py_None;
8205 Py_INCREF(x);
8206 return x;
8207 } else
8208 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008209 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008210 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008211 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008212 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008213 long value = PyLong_AS_LONG(x);
8214 if (value < 0 || value > 255) {
8215 PyErr_SetString(PyExc_TypeError,
8216 "character mapping must be in range(256)");
8217 Py_DECREF(x);
8218 return NULL;
8219 }
8220 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008221 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008222 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008223 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008224 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008225 /* wrong return value */
8226 PyErr_Format(PyExc_TypeError,
8227 "character mapping must return integer, bytes or None, not %.400s",
8228 x->ob_type->tp_name);
8229 Py_DECREF(x);
8230 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008231 }
8232}
8233
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008234static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008235charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008236{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008237 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8238 /* exponentially overallocate to minimize reallocations */
8239 if (requiredsize < 2*outsize)
8240 requiredsize = 2*outsize;
8241 if (_PyBytes_Resize(outobj, requiredsize))
8242 return -1;
8243 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008244}
8245
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* an exception was raised */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008249/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008250 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008251 space is available. Return a new reference to the object that
8252 was put in the output buffer, or Py_None, if the mapping was undefined
8253 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008254 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008255static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008256charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008257 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008258{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008259 PyObject *rep;
8260 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008261 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008262
Christian Heimes90aa7642007-12-19 02:45:37 +00008263 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008264 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008265 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008266 if (res == -1)
8267 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008268 if (outsize<requiredsize)
8269 if (charmapencode_resize(outobj, outpos, requiredsize))
8270 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008271 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008272 outstart[(*outpos)++] = (char)res;
8273 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008274 }
8275
8276 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008277 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008278 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008279 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008280 Py_DECREF(rep);
8281 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008282 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008283 if (PyLong_Check(rep)) {
8284 Py_ssize_t requiredsize = *outpos+1;
8285 if (outsize<requiredsize)
8286 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8287 Py_DECREF(rep);
8288 return enc_EXCEPTION;
8289 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008290 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008291 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008292 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008293 else {
8294 const char *repchars = PyBytes_AS_STRING(rep);
8295 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8296 Py_ssize_t requiredsize = *outpos+repsize;
8297 if (outsize<requiredsize)
8298 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8299 Py_DECREF(rep);
8300 return enc_EXCEPTION;
8301 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008302 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008303 memcpy(outstart + *outpos, repchars, repsize);
8304 *outpos += repsize;
8305 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008306 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008307 Py_DECREF(rep);
8308 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008309}
8310
8311/* handle an error in PyUnicode_EncodeCharmap
8312 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008313static int
8314charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008315 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008316 PyObject **exceptionObject,
Victor Stinner50149202015-09-22 00:26:54 +02008317 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008318 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008319{
8320 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008321 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008322 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008323 enum PyUnicode_Kind kind;
8324 void *data;
8325 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008326 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008327 Py_ssize_t collstartpos = *inpos;
8328 Py_ssize_t collendpos = *inpos+1;
8329 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008330 char *encoding = "charmap";
8331 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008332 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008333 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008334 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008335
Benjamin Petersonbac79492012-01-14 13:34:47 -05008336 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008337 return -1;
8338 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008339 /* find all unencodable characters */
8340 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008341 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008342 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008343 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008344 val = encoding_map_lookup(ch, mapping);
8345 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008346 break;
8347 ++collendpos;
8348 continue;
8349 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008350
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008351 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8352 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008353 if (rep==NULL)
8354 return -1;
8355 else if (rep!=Py_None) {
8356 Py_DECREF(rep);
8357 break;
8358 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008359 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008360 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008361 }
8362 /* cache callback name lookup
8363 * (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02008364 if (*error_handler == _Py_ERROR_UNKNOWN)
8365 *error_handler = get_error_handler(errors);
8366
8367 switch (*error_handler) {
8368 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008369 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008370 return -1;
Victor Stinner50149202015-09-22 00:26:54 +02008371
8372 case _Py_ERROR_REPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008373 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008374 x = charmapencode_output('?', mapping, res, respos);
8375 if (x==enc_EXCEPTION) {
8376 return -1;
8377 }
8378 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008379 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008380 return -1;
8381 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008382 }
8383 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02008384 case _Py_ERROR_IGNORE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008385 *inpos = collendpos;
8386 break;
Victor Stinner50149202015-09-22 00:26:54 +02008387
8388 case _Py_ERROR_XMLCHARREFREPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008389 /* generate replacement (temporarily (mis)uses p) */
8390 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008391 char buffer[2+29+1+1];
8392 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008393 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008394 for (cp = buffer; *cp; ++cp) {
8395 x = charmapencode_output(*cp, mapping, res, respos);
8396 if (x==enc_EXCEPTION)
8397 return -1;
8398 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008399 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008400 return -1;
8401 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008402 }
8403 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008404 *inpos = collendpos;
8405 break;
Victor Stinner50149202015-09-22 00:26:54 +02008406
Benjamin Peterson14339b62009-01-31 16:36:08 +00008407 default:
Victor Stinner50149202015-09-22 00:26:54 +02008408 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008409 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008410 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008411 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008412 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008413 if (PyBytes_Check(repunicode)) {
8414 /* Directly copy bytes result to output. */
8415 Py_ssize_t outsize = PyBytes_Size(*res);
8416 Py_ssize_t requiredsize;
8417 repsize = PyBytes_Size(repunicode);
8418 requiredsize = *respos + repsize;
8419 if (requiredsize > outsize)
8420 /* Make room for all additional bytes. */
8421 if (charmapencode_resize(res, respos, requiredsize)) {
8422 Py_DECREF(repunicode);
8423 return -1;
8424 }
8425 memcpy(PyBytes_AsString(*res) + *respos,
8426 PyBytes_AsString(repunicode), repsize);
8427 *respos += repsize;
8428 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008429 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008430 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008431 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008432 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008433 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008434 Py_DECREF(repunicode);
8435 return -1;
8436 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008437 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008438 data = PyUnicode_DATA(repunicode);
8439 kind = PyUnicode_KIND(repunicode);
8440 for (index = 0; index < repsize; index++) {
8441 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8442 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008443 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008444 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008445 return -1;
8446 }
8447 else if (x==enc_FAILED) {
8448 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008449 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008450 return -1;
8451 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008452 }
8453 *inpos = newpos;
8454 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008455 }
8456 return 0;
8457}
8458
Alexander Belopolsky40018472011-02-26 01:02:56 +00008459PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008460_PyUnicode_EncodeCharmap(PyObject *unicode,
8461 PyObject *mapping,
8462 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008463{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008464 /* output object */
8465 PyObject *res = NULL;
8466 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008467 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008468 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008469 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008470 Py_ssize_t respos = 0;
Victor Stinner50149202015-09-22 00:26:54 +02008471 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008472 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02008473 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008474 void *data;
8475 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008476
Benjamin Petersonbac79492012-01-14 13:34:47 -05008477 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008478 return NULL;
8479 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008480 data = PyUnicode_DATA(unicode);
8481 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008482
Guido van Rossumd57fd912000-03-10 22:53:23 +00008483 /* Default to Latin-1 */
8484 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008485 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008486
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008487 /* allocate enough for a simple encoding without
8488 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008489 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008490 if (res == NULL)
8491 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008492 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008493 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008494
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008495 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008496 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008497 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008498 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008499 if (x==enc_EXCEPTION) /* error */
8500 goto onError;
8501 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008502 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008503 &exc,
Victor Stinner50149202015-09-22 00:26:54 +02008504 &error_handler, &error_handler_obj, errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00008505 &res, &respos)) {
8506 goto onError;
8507 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008508 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008509 else
8510 /* done with this character => adjust input position */
8511 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008512 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008513
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008514 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008515 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008516 if (_PyBytes_Resize(&res, respos) < 0)
8517 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008518
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008519 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008520 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008521 return res;
8522
Benjamin Peterson29060642009-01-31 22:14:21 +00008523 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008524 Py_XDECREF(res);
8525 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008526 Py_XDECREF(error_handler_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008527 return NULL;
8528}
8529
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008530/* Deprecated */
8531PyObject *
8532PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8533 Py_ssize_t size,
8534 PyObject *mapping,
8535 const char *errors)
8536{
8537 PyObject *result;
8538 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8539 if (unicode == NULL)
8540 return NULL;
8541 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8542 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008543 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008544}
8545
Alexander Belopolsky40018472011-02-26 01:02:56 +00008546PyObject *
8547PyUnicode_AsCharmapString(PyObject *unicode,
8548 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008549{
8550 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008551 PyErr_BadArgument();
8552 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008553 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008554 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008555}
8556
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008557/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008558static void
8559make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008560 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008561 Py_ssize_t startpos, Py_ssize_t endpos,
8562 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008563{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008564 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008565 *exceptionObject = _PyUnicodeTranslateError_Create(
8566 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008567 }
8568 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008569 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8570 goto onError;
8571 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8572 goto onError;
8573 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8574 goto onError;
8575 return;
8576 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008577 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008578 }
8579}
8580
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008581/* error handling callback helper:
8582 build arguments, call the callback and check the arguments,
8583 put the result into newpos and return the replacement string, which
8584 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008585static PyObject *
8586unicode_translate_call_errorhandler(const char *errors,
8587 PyObject **errorHandler,
8588 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008589 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008590 Py_ssize_t startpos, Py_ssize_t endpos,
8591 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008592{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02008593 static const char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008594
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008595 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008596 PyObject *restuple;
8597 PyObject *resunicode;
8598
8599 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008600 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008601 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008602 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008603 }
8604
8605 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008606 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008607 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008608 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008609
8610 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008611 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008612 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008613 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008614 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008615 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008616 Py_DECREF(restuple);
8617 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008618 }
8619 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008620 &resunicode, &i_newpos)) {
8621 Py_DECREF(restuple);
8622 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008623 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008624 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008625 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008626 else
8627 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008628 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02008629 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008630 Py_DECREF(restuple);
8631 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008632 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008633 Py_INCREF(resunicode);
8634 Py_DECREF(restuple);
8635 return resunicode;
8636}
8637
8638/* Lookup the character ch in the mapping and put the result in result,
8639 which must be decrefed by the caller.
8640 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008641static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008642charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008643{
Christian Heimes217cfd12007-12-02 14:31:20 +00008644 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008645 PyObject *x;
8646
8647 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008648 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008649 x = PyObject_GetItem(mapping, w);
8650 Py_DECREF(w);
8651 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008652 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8653 /* No mapping found means: use 1:1 mapping. */
8654 PyErr_Clear();
8655 *result = NULL;
8656 return 0;
8657 } else
8658 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008659 }
8660 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008661 *result = x;
8662 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008663 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008664 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008665 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008666 if (value < 0 || value > MAX_UNICODE) {
8667 PyErr_Format(PyExc_ValueError,
8668 "character mapping must be in range(0x%x)",
8669 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008670 Py_DECREF(x);
8671 return -1;
8672 }
8673 *result = x;
8674 return 0;
8675 }
8676 else if (PyUnicode_Check(x)) {
8677 *result = x;
8678 return 0;
8679 }
8680 else {
8681 /* wrong return value */
8682 PyErr_SetString(PyExc_TypeError,
8683 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008684 Py_DECREF(x);
8685 return -1;
8686 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008687}
Victor Stinner1194ea02014-04-04 19:37:40 +02008688
8689/* lookup the character, write the result into the writer.
8690 Return 1 if the result was written into the writer, return 0 if the mapping
8691 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008692static int
Victor Stinner1194ea02014-04-04 19:37:40 +02008693charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8694 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008695{
Victor Stinner1194ea02014-04-04 19:37:40 +02008696 PyObject *item;
8697
8698 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00008699 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008700
8701 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008702 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02008703 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008704 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008705 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008706 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008707 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008708
8709 if (item == Py_None) {
8710 Py_DECREF(item);
8711 return 0;
8712 }
8713
8714 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02008715 long ch = (Py_UCS4)PyLong_AS_LONG(item);
8716 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8717 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008718 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8719 Py_DECREF(item);
8720 return -1;
8721 }
8722 Py_DECREF(item);
8723 return 1;
8724 }
8725
8726 if (!PyUnicode_Check(item)) {
8727 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008728 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008729 }
8730
8731 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8732 Py_DECREF(item);
8733 return -1;
8734 }
8735
8736 Py_DECREF(item);
8737 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008738}
8739
Victor Stinner89a76ab2014-04-05 11:44:04 +02008740static int
8741unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8742 Py_UCS1 *translate)
8743{
Benjamin Peterson1365de72014-04-07 20:15:41 -04008744 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008745 int ret = 0;
8746
Victor Stinner89a76ab2014-04-05 11:44:04 +02008747 if (charmaptranslate_lookup(ch, mapping, &item)) {
8748 return -1;
8749 }
8750
8751 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008752 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02008753 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008754 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008755 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008756 /* not found => default to 1:1 mapping */
8757 translate[ch] = ch;
8758 return 1;
8759 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008760 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02008761 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008762 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8763 used it */
8764 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008765 /* invalid character or character outside ASCII:
8766 skip the fast translate */
8767 goto exit;
8768 }
8769 translate[ch] = (Py_UCS1)replace;
8770 }
8771 else if (PyUnicode_Check(item)) {
8772 Py_UCS4 replace;
8773
8774 if (PyUnicode_READY(item) == -1) {
8775 Py_DECREF(item);
8776 return -1;
8777 }
8778 if (PyUnicode_GET_LENGTH(item) != 1)
8779 goto exit;
8780
8781 replace = PyUnicode_READ_CHAR(item, 0);
8782 if (replace > 127)
8783 goto exit;
8784 translate[ch] = (Py_UCS1)replace;
8785 }
8786 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008787 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02008788 goto exit;
8789 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008790 ret = 1;
8791
Benjamin Peterson1365de72014-04-07 20:15:41 -04008792 exit:
8793 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008794 return ret;
8795}
8796
8797/* Fast path for ascii => ascii translation. Return 1 if the whole string
8798 was translated into writer, return 0 if the input string was partially
8799 translated into writer, raise an exception and return -1 on error. */
8800static int
8801unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008802 _PyUnicodeWriter *writer, int ignore,
8803 Py_ssize_t *input_pos)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008804{
Victor Stinner872b2912014-04-05 14:27:07 +02008805 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008806 Py_ssize_t len;
8807 Py_UCS1 *in, *end, *out;
Victor Stinner872b2912014-04-05 14:27:07 +02008808 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008809
Victor Stinner89a76ab2014-04-05 11:44:04 +02008810 len = PyUnicode_GET_LENGTH(input);
8811
Victor Stinner872b2912014-04-05 14:27:07 +02008812 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008813
8814 in = PyUnicode_1BYTE_DATA(input);
8815 end = in + len;
8816
8817 assert(PyUnicode_IS_ASCII(writer->buffer));
8818 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
8819 out = PyUnicode_1BYTE_DATA(writer->buffer);
8820
Victor Stinner872b2912014-04-05 14:27:07 +02008821 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008822 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02008823 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008824 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02008825 int translate = unicode_fast_translate_lookup(mapping, ch,
8826 ascii_table);
8827 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008828 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02008829 if (translate == 0)
8830 goto exit;
8831 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008832 }
Victor Stinner872b2912014-04-05 14:27:07 +02008833 if (ch2 == 0xfe) {
8834 if (ignore)
8835 continue;
8836 goto exit;
8837 }
8838 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008839 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02008840 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008841 }
Victor Stinner872b2912014-04-05 14:27:07 +02008842 res = 1;
8843
8844exit:
8845 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008846 *input_pos = in - PyUnicode_1BYTE_DATA(input);
Victor Stinner872b2912014-04-05 14:27:07 +02008847 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008848}
8849
Victor Stinner3222da22015-10-01 22:07:32 +02008850static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008851_PyUnicode_TranslateCharmap(PyObject *input,
8852 PyObject *mapping,
8853 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008854{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008855 /* input object */
Victor Stinner1194ea02014-04-04 19:37:40 +02008856 char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008857 Py_ssize_t size, i;
8858 int kind;
8859 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02008860 _PyUnicodeWriter writer;
8861 /* error handler */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008862 char *reason = "character maps to <undefined>";
8863 PyObject *errorHandler = NULL;
8864 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008865 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008866 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008867
Guido van Rossumd57fd912000-03-10 22:53:23 +00008868 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008869 PyErr_BadArgument();
8870 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008871 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008872
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008873 if (PyUnicode_READY(input) == -1)
8874 return NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008875 data = (char*)PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008876 kind = PyUnicode_KIND(input);
8877 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008878
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03008879 if (size == 0)
8880 return PyUnicode_FromObject(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008881
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008882 /* allocate enough for a simple 1:1 translation without
8883 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02008884 _PyUnicodeWriter_Init(&writer);
8885 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008886 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008887
Victor Stinner872b2912014-04-05 14:27:07 +02008888 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
8889
Victor Stinner33798672016-03-01 21:59:58 +01008890 if (PyUnicode_READY(input) == -1)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008891 return NULL;
Victor Stinner33798672016-03-01 21:59:58 +01008892 if (PyUnicode_IS_ASCII(input)) {
8893 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
8894 if (res < 0) {
8895 _PyUnicodeWriter_Dealloc(&writer);
8896 return NULL;
8897 }
8898 if (res == 1)
8899 return _PyUnicodeWriter_Finish(&writer);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008900 }
Victor Stinner33798672016-03-01 21:59:58 +01008901 else {
8902 i = 0;
8903 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008904
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008905 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008906 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008907 int translate;
8908 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8909 Py_ssize_t newpos;
8910 /* startpos for collecting untranslatable chars */
8911 Py_ssize_t collstart;
8912 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02008913 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008914
Victor Stinner1194ea02014-04-04 19:37:40 +02008915 ch = PyUnicode_READ(kind, data, i);
8916 translate = charmaptranslate_output(ch, mapping, &writer);
8917 if (translate < 0)
8918 goto onError;
8919
8920 if (translate != 0) {
8921 /* it worked => adjust input pointer */
8922 ++i;
8923 continue;
8924 }
8925
8926 /* untranslatable character */
8927 collstart = i;
8928 collend = i+1;
8929
8930 /* find all untranslatable characters */
8931 while (collend < size) {
8932 PyObject *x;
8933 ch = PyUnicode_READ(kind, data, collend);
8934 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008935 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02008936 Py_XDECREF(x);
8937 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008938 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02008939 ++collend;
8940 }
8941
8942 if (ignore) {
8943 i = collend;
8944 }
8945 else {
8946 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
8947 reason, input, &exc,
8948 collstart, collend, &newpos);
8949 if (repunicode == NULL)
8950 goto onError;
8951 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008952 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02008953 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008954 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008955 Py_DECREF(repunicode);
8956 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008957 }
8958 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008959 Py_XDECREF(exc);
8960 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02008961 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008962
Benjamin Peterson29060642009-01-31 22:14:21 +00008963 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02008964 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008965 Py_XDECREF(exc);
8966 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008967 return NULL;
8968}
8969
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008970/* Deprecated. Use PyUnicode_Translate instead. */
8971PyObject *
8972PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8973 Py_ssize_t size,
8974 PyObject *mapping,
8975 const char *errors)
8976{
Christian Heimes5f520f42012-09-11 14:03:25 +02008977 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008978 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8979 if (!unicode)
8980 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02008981 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8982 Py_DECREF(unicode);
8983 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008984}
8985
Alexander Belopolsky40018472011-02-26 01:02:56 +00008986PyObject *
8987PyUnicode_Translate(PyObject *str,
8988 PyObject *mapping,
8989 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008990{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03008991 if (ensure_unicode(str) < 0)
Christian Heimes5f520f42012-09-11 14:03:25 +02008992 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03008993 return _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008994}
Tim Petersced69f82003-09-16 20:30:58 +00008995
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008996static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008997fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008998{
8999 /* No need to call PyUnicode_READY(self) because this function is only
9000 called as a callback from fixup() which does it already. */
9001 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9002 const int kind = PyUnicode_KIND(self);
9003 void *data = PyUnicode_DATA(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02009004 Py_UCS4 maxchar = 127, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009005 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009006 Py_ssize_t i;
9007
9008 for (i = 0; i < len; ++i) {
9009 ch = PyUnicode_READ(kind, data, i);
9010 fixed = 0;
9011 if (ch > 127) {
9012 if (Py_UNICODE_ISSPACE(ch))
9013 fixed = ' ';
9014 else {
9015 const int decimal = Py_UNICODE_TODECIMAL(ch);
9016 if (decimal >= 0)
9017 fixed = '0' + decimal;
9018 }
9019 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009020 modified = 1;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009021 maxchar = Py_MAX(maxchar, fixed);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009022 PyUnicode_WRITE(kind, data, i, fixed);
9023 }
Victor Stinnere6abb482012-05-02 01:15:40 +02009024 else
Benjamin Peterson7e303732013-06-10 09:19:46 -07009025 maxchar = Py_MAX(maxchar, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009026 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009027 }
9028
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009029 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009030}
9031
9032PyObject *
9033_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9034{
9035 if (!PyUnicode_Check(unicode)) {
9036 PyErr_BadInternalCall();
9037 return NULL;
9038 }
9039 if (PyUnicode_READY(unicode) == -1)
9040 return NULL;
9041 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
9042 /* If the string is already ASCII, just return the same string */
9043 Py_INCREF(unicode);
9044 return unicode;
9045 }
Victor Stinner9310abb2011-10-05 00:59:23 +02009046 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009047}
9048
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009049PyObject *
9050PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9051 Py_ssize_t length)
9052{
Victor Stinnerf0124502011-11-21 23:12:56 +01009053 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009054 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009055 Py_UCS4 maxchar;
9056 enum PyUnicode_Kind kind;
9057 void *data;
9058
Victor Stinner99d7ad02012-02-22 13:37:39 +01009059 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009060 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009061 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009062 if (ch > 127) {
9063 int decimal = Py_UNICODE_TODECIMAL(ch);
9064 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009065 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009066 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009067 }
9068 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009069
9070 /* Copy to a new string */
9071 decimal = PyUnicode_New(length, maxchar);
9072 if (decimal == NULL)
9073 return decimal;
9074 kind = PyUnicode_KIND(decimal);
9075 data = PyUnicode_DATA(decimal);
9076 /* Iterate over code points */
9077 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009078 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01009079 if (ch > 127) {
9080 int decimal = Py_UNICODE_TODECIMAL(ch);
9081 if (decimal >= 0)
9082 ch = '0' + decimal;
9083 }
9084 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009085 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009086 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009087}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009088/* --- Decimal Encoder ---------------------------------------------------- */
9089
Alexander Belopolsky40018472011-02-26 01:02:56 +00009090int
9091PyUnicode_EncodeDecimal(Py_UNICODE *s,
9092 Py_ssize_t length,
9093 char *output,
9094 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009095{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009096 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009097 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009098 enum PyUnicode_Kind kind;
9099 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009100
9101 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009102 PyErr_BadArgument();
9103 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009104 }
9105
Victor Stinner42bf7752011-11-21 22:52:58 +01009106 unicode = PyUnicode_FromUnicode(s, length);
9107 if (unicode == NULL)
9108 return -1;
9109
Benjamin Petersonbac79492012-01-14 13:34:47 -05009110 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner6345be92011-11-25 20:09:01 +01009111 Py_DECREF(unicode);
9112 return -1;
9113 }
Victor Stinner42bf7752011-11-21 22:52:58 +01009114 kind = PyUnicode_KIND(unicode);
9115 data = PyUnicode_DATA(unicode);
9116
Victor Stinnerb84d7232011-11-22 01:50:07 +01009117 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009118 PyObject *exc;
9119 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009120 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009121 Py_ssize_t startpos;
9122
9123 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009124
Benjamin Peterson29060642009-01-31 22:14:21 +00009125 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009126 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009127 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009128 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009129 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009130 decimal = Py_UNICODE_TODECIMAL(ch);
9131 if (decimal >= 0) {
9132 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009133 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009134 continue;
9135 }
9136 if (0 < ch && ch < 256) {
9137 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009138 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009139 continue;
9140 }
Victor Stinner6345be92011-11-25 20:09:01 +01009141
Victor Stinner42bf7752011-11-21 22:52:58 +01009142 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009143 exc = NULL;
9144 raise_encode_exception(&exc, "decimal", unicode,
9145 startpos, startpos+1,
9146 "invalid decimal Unicode string");
9147 Py_XDECREF(exc);
9148 Py_DECREF(unicode);
9149 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009150 }
9151 /* 0-terminate the output string */
9152 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009153 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009154 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009155}
9156
Guido van Rossumd57fd912000-03-10 22:53:23 +00009157/* --- Helpers ------------------------------------------------------------ */
9158
/* Helper macro to fix up start/end slice values against a length `len`.

   - `end` greater than `len` is clamped to `len`;
   - a negative `end` or `start` is taken relative to `len` and, if still
     negative, clamped to 0.

   `start` and `end` must be modifiable lvalues; they are updated in place.
   Every argument may be evaluated more than once, so none of them should
   have side effects.

   The body is wrapped in do { ... } while (0) so that the macro expands to
   exactly one statement and can be used safely in an unbraced if/else.
   Call sites must end the invocation with a semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
9173
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009174static Py_ssize_t
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009175any_find_slice(PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009176 Py_ssize_t start,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009177 Py_ssize_t end,
9178 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009179{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009180 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009181 void *buf1, *buf2;
9182 Py_ssize_t len1, len2, result;
9183
9184 kind1 = PyUnicode_KIND(s1);
9185 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009186 if (kind1 < kind2)
9187 return -1;
9188
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009189 len1 = PyUnicode_GET_LENGTH(s1);
9190 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009191 ADJUST_INDICES(start, end, len1);
9192 if (end - start < len2)
9193 return -1;
9194
9195 buf1 = PyUnicode_DATA(s1);
9196 buf2 = PyUnicode_DATA(s2);
9197 if (len2 == 1) {
9198 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9199 result = findchar((const char *)buf1 + kind1*start,
9200 kind1, end - start, ch, direction);
9201 if (result == -1)
9202 return -1;
9203 else
9204 return start + result;
9205 }
9206
9207 if (kind2 != kind1) {
9208 buf2 = _PyUnicode_AsKind(s2, kind1);
9209 if (!buf2)
9210 return -2;
9211 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009212
Victor Stinner794d5672011-10-10 03:21:36 +02009213 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009214 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009215 case PyUnicode_1BYTE_KIND:
9216 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9217 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9218 else
9219 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9220 break;
9221 case PyUnicode_2BYTE_KIND:
9222 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9223 break;
9224 case PyUnicode_4BYTE_KIND:
9225 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9226 break;
9227 default:
9228 assert(0); result = -2;
9229 }
9230 }
9231 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009232 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009233 case PyUnicode_1BYTE_KIND:
9234 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9235 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9236 else
9237 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9238 break;
9239 case PyUnicode_2BYTE_KIND:
9240 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9241 break;
9242 case PyUnicode_4BYTE_KIND:
9243 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9244 break;
9245 default:
9246 assert(0); result = -2;
9247 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009248 }
9249
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009250 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009251 PyMem_Free(buf2);
9252
9253 return result;
9254}
9255
9256Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009257_PyUnicode_InsertThousandsGrouping(
9258 PyObject *unicode, Py_ssize_t index,
9259 Py_ssize_t n_buffer,
9260 void *digits, Py_ssize_t n_digits,
9261 Py_ssize_t min_width,
9262 const char *grouping, PyObject *thousands_sep,
9263 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009264{
Victor Stinner41a863c2012-02-24 00:37:51 +01009265 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009266 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01009267 Py_ssize_t thousands_sep_len;
9268 Py_ssize_t len;
9269
9270 if (unicode != NULL) {
9271 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009272 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01009273 }
9274 else {
9275 kind = PyUnicode_1BYTE_KIND;
9276 data = NULL;
9277 }
9278 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
9279 thousands_sep_data = PyUnicode_DATA(thousands_sep);
9280 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9281 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01009282 if (thousands_sep_kind < kind) {
9283 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
9284 if (!thousands_sep_data)
9285 return -1;
9286 }
9287 else {
9288 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
9289 if (!data)
9290 return -1;
9291 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009292 }
9293
Benjamin Petersonead6b532011-12-20 17:23:42 -06009294 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009295 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009296 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01009297 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009298 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009299 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009300 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009301 else
Victor Stinner41a863c2012-02-24 00:37:51 +01009302 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02009303 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009304 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009305 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009306 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009307 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009308 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009309 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009310 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009311 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009312 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009313 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009314 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009315 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009316 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009317 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009318 break;
9319 default:
9320 assert(0);
9321 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009322 }
Victor Stinner90f50d42012-02-24 01:44:47 +01009323 if (unicode != NULL && thousands_sep_kind != kind) {
9324 if (thousands_sep_kind < kind)
9325 PyMem_Free(thousands_sep_data);
9326 else
9327 PyMem_Free(data);
9328 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009329 if (unicode == NULL) {
9330 *maxchar = 127;
9331 if (len != n_digits) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009332 *maxchar = Py_MAX(*maxchar,
Victor Stinnere6abb482012-05-02 01:15:40 +02009333 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
Victor Stinner41a863c2012-02-24 00:37:51 +01009334 }
9335 }
9336 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009337}
9338
9339
Alexander Belopolsky40018472011-02-26 01:02:56 +00009340Py_ssize_t
9341PyUnicode_Count(PyObject *str,
9342 PyObject *substr,
9343 Py_ssize_t start,
9344 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009345{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009346 Py_ssize_t result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009347 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009348 void *buf1 = NULL, *buf2 = NULL;
9349 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009350
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009351 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009352 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009353
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009354 kind1 = PyUnicode_KIND(str);
9355 kind2 = PyUnicode_KIND(substr);
9356 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009357 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009358
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009359 len1 = PyUnicode_GET_LENGTH(str);
9360 len2 = PyUnicode_GET_LENGTH(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009361 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009362 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009363 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009364
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009365 buf1 = PyUnicode_DATA(str);
9366 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009367 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009368 buf2 = _PyUnicode_AsKind(substr, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009369 if (!buf2)
9370 goto onError;
9371 }
9372
9373 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009374 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009375 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
Victor Stinnerc3cec782011-10-05 21:24:08 +02009376 result = asciilib_count(
9377 ((Py_UCS1*)buf1) + start, end - start,
9378 buf2, len2, PY_SSIZE_T_MAX
9379 );
9380 else
9381 result = ucs1lib_count(
9382 ((Py_UCS1*)buf1) + start, end - start,
9383 buf2, len2, PY_SSIZE_T_MAX
9384 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009385 break;
9386 case PyUnicode_2BYTE_KIND:
9387 result = ucs2lib_count(
9388 ((Py_UCS2*)buf1) + start, end - start,
9389 buf2, len2, PY_SSIZE_T_MAX
9390 );
9391 break;
9392 case PyUnicode_4BYTE_KIND:
9393 result = ucs4lib_count(
9394 ((Py_UCS4*)buf1) + start, end - start,
9395 buf2, len2, PY_SSIZE_T_MAX
9396 );
9397 break;
9398 default:
9399 assert(0); result = 0;
9400 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009401
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009402 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009403 PyMem_Free(buf2);
9404
Guido van Rossumd57fd912000-03-10 22:53:23 +00009405 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009406 onError:
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009407 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009408 PyMem_Free(buf2);
9409 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009410}
9411
Alexander Belopolsky40018472011-02-26 01:02:56 +00009412Py_ssize_t
9413PyUnicode_Find(PyObject *str,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009414 PyObject *substr,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009415 Py_ssize_t start,
9416 Py_ssize_t end,
9417 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009418{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009419 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009420 return -2;
Tim Petersced69f82003-09-16 20:30:58 +00009421
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009422 return any_find_slice(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009423}
9424
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009425Py_ssize_t
9426PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9427 Py_ssize_t start, Py_ssize_t end,
9428 int direction)
9429{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009430 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009431 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009432 if (PyUnicode_READY(str) == -1)
9433 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009434 if (start < 0 || end < 0) {
9435 PyErr_SetString(PyExc_IndexError, "string index out of range");
9436 return -2;
9437 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009438 if (end > PyUnicode_GET_LENGTH(str))
9439 end = PyUnicode_GET_LENGTH(str);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009440 if (start >= end)
9441 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009442 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009443 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9444 kind, end-start, ch, direction);
9445 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009446 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009447 else
9448 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009449}
9450
Alexander Belopolsky40018472011-02-26 01:02:56 +00009451static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009452tailmatch(PyObject *self,
9453 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009454 Py_ssize_t start,
9455 Py_ssize_t end,
9456 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009457{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009458 int kind_self;
9459 int kind_sub;
9460 void *data_self;
9461 void *data_sub;
9462 Py_ssize_t offset;
9463 Py_ssize_t i;
9464 Py_ssize_t end_sub;
9465
9466 if (PyUnicode_READY(self) == -1 ||
9467 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009468 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009469
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009470 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9471 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009472 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009473 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009474
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009475 if (PyUnicode_GET_LENGTH(substring) == 0)
9476 return 1;
9477
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009478 kind_self = PyUnicode_KIND(self);
9479 data_self = PyUnicode_DATA(self);
9480 kind_sub = PyUnicode_KIND(substring);
9481 data_sub = PyUnicode_DATA(substring);
9482 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9483
9484 if (direction > 0)
9485 offset = end;
9486 else
9487 offset = start;
9488
9489 if (PyUnicode_READ(kind_self, data_self, offset) ==
9490 PyUnicode_READ(kind_sub, data_sub, 0) &&
9491 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9492 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9493 /* If both are of the same kind, memcmp is sufficient */
9494 if (kind_self == kind_sub) {
9495 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009496 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009497 data_sub,
9498 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009499 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009500 }
Martin Pantere26da7c2016-06-02 10:07:09 +00009501 /* otherwise we have to compare each character by first accessing it */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009502 else {
9503 /* We do not need to compare 0 and len(substring)-1 because
9504 the if statement above ensured already that they are equal
9505 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009506 for (i = 1; i < end_sub; ++i) {
9507 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9508 PyUnicode_READ(kind_sub, data_sub, i))
9509 return 0;
9510 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009511 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009512 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009513 }
9514
9515 return 0;
9516}
9517
Alexander Belopolsky40018472011-02-26 01:02:56 +00009518Py_ssize_t
9519PyUnicode_Tailmatch(PyObject *str,
9520 PyObject *substr,
9521 Py_ssize_t start,
9522 Py_ssize_t end,
9523 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009524{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009525 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009526 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009527
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009528 return tailmatch(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009529}
9530
Guido van Rossumd57fd912000-03-10 22:53:23 +00009531/* Apply fixfct filter to the Unicode object self and return a
9532 reference to the modified object */
9533
Alexander Belopolsky40018472011-02-26 01:02:56 +00009534static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009535fixup(PyObject *self,
9536 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009537{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009538 PyObject *u;
9539 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009540 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009541
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009542 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009543 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009544 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009545 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009546
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009547 /* fix functions return the new maximum character in a string,
9548 if the kind of the resulting unicode object does not change,
9549 everything is fine. Otherwise we need to change the string kind
9550 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009551 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009552
9553 if (maxchar_new == 0) {
9554 /* no changes */;
9555 if (PyUnicode_CheckExact(self)) {
9556 Py_DECREF(u);
9557 Py_INCREF(self);
9558 return self;
9559 }
9560 else
9561 return u;
9562 }
9563
Victor Stinnere6abb482012-05-02 01:15:40 +02009564 maxchar_new = align_maxchar(maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009565
Victor Stinnereaab6042011-12-11 22:22:39 +01009566 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009567 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009568
9569 /* In case the maximum character changed, we need to
9570 convert the string to the new category. */
9571 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9572 if (v == NULL) {
9573 Py_DECREF(u);
9574 return NULL;
9575 }
9576 if (maxchar_new > maxchar_old) {
9577 /* If the maxchar increased so that the kind changed, not all
9578 characters are representable anymore and we need to fix the
9579 string again. This only happens in very few cases. */
Victor Stinnerd3f08822012-05-29 12:57:52 +02009580 _PyUnicode_FastCopyCharacters(v, 0,
9581 self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinnereaab6042011-12-11 22:22:39 +01009582 maxchar_old = fixfct(v);
9583 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009584 }
9585 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009586 _PyUnicode_FastCopyCharacters(v, 0,
9587 u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009588 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009589 Py_DECREF(u);
9590 assert(_PyUnicode_CheckConsistency(v, 1));
9591 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009592}
9593
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009594static PyObject *
9595ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009596{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009597 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9598 char *resdata, *data = PyUnicode_DATA(self);
9599 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009600
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009601 res = PyUnicode_New(len, 127);
9602 if (res == NULL)
9603 return NULL;
9604 resdata = PyUnicode_DATA(res);
9605 if (lower)
9606 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009607 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009608 _Py_bytes_upper(resdata, data, len);
9609 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009610}
9611
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009612static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009613handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009614{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009615 Py_ssize_t j;
9616 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01009617 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009618 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009619
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009620 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9621
9622 where ! is a negation and \p{xxx} is a character with property xxx.
9623 */
9624 for (j = i - 1; j >= 0; j--) {
9625 c = PyUnicode_READ(kind, data, j);
9626 if (!_PyUnicode_IsCaseIgnorable(c))
9627 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009628 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009629 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9630 if (final_sigma) {
9631 for (j = i + 1; j < length; j++) {
9632 c = PyUnicode_READ(kind, data, j);
9633 if (!_PyUnicode_IsCaseIgnorable(c))
9634 break;
9635 }
9636 final_sigma = j == length || !_PyUnicode_IsCased(c);
9637 }
9638 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009639}
9640
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009641static int
9642lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9643 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009644{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009645 /* Obscure special case. */
9646 if (c == 0x3A3) {
9647 mapped[0] = handle_capital_sigma(kind, data, length, i);
9648 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009649 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009650 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009651}
9652
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009653static Py_ssize_t
9654do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009655{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009656 Py_ssize_t i, k = 0;
9657 int n_res, j;
9658 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009659
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009660 c = PyUnicode_READ(kind, data, 0);
9661 n_res = _PyUnicode_ToUpperFull(c, mapped);
9662 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009663 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009664 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009665 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009666 for (i = 1; i < length; i++) {
9667 c = PyUnicode_READ(kind, data, i);
9668 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9669 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009670 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009671 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009672 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009673 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009674 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009675}
9676
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009677static Py_ssize_t
9678do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9679 Py_ssize_t i, k = 0;
9680
9681 for (i = 0; i < length; i++) {
9682 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9683 int n_res, j;
9684 if (Py_UNICODE_ISUPPER(c)) {
9685 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9686 }
9687 else if (Py_UNICODE_ISLOWER(c)) {
9688 n_res = _PyUnicode_ToUpperFull(c, mapped);
9689 }
9690 else {
9691 n_res = 1;
9692 mapped[0] = c;
9693 }
9694 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009695 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009696 res[k++] = mapped[j];
9697 }
9698 }
9699 return k;
9700}
9701
9702static Py_ssize_t
9703do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9704 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009705{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009706 Py_ssize_t i, k = 0;
9707
9708 for (i = 0; i < length; i++) {
9709 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9710 int n_res, j;
9711 if (lower)
9712 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9713 else
9714 n_res = _PyUnicode_ToUpperFull(c, mapped);
9715 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009716 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009717 res[k++] = mapped[j];
9718 }
9719 }
9720 return k;
9721}
9722
9723static Py_ssize_t
9724do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9725{
9726 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9727}
9728
9729static Py_ssize_t
9730do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9731{
9732 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9733}
9734
Benjamin Petersone51757f2012-01-12 21:10:29 -05009735static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009736do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9737{
9738 Py_ssize_t i, k = 0;
9739
9740 for (i = 0; i < length; i++) {
9741 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9742 Py_UCS4 mapped[3];
9743 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9744 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009745 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009746 res[k++] = mapped[j];
9747 }
9748 }
9749 return k;
9750}
9751
9752static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009753do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9754{
9755 Py_ssize_t i, k = 0;
9756 int previous_is_cased;
9757
9758 previous_is_cased = 0;
9759 for (i = 0; i < length; i++) {
9760 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9761 Py_UCS4 mapped[3];
9762 int n_res, j;
9763
9764 if (previous_is_cased)
9765 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9766 else
9767 n_res = _PyUnicode_ToTitleFull(c, mapped);
9768
9769 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009770 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009771 res[k++] = mapped[j];
9772 }
9773
9774 previous_is_cased = _PyUnicode_IsCased(c);
9775 }
9776 return k;
9777}
9778
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009779static PyObject *
9780case_operation(PyObject *self,
9781 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9782{
9783 PyObject *res = NULL;
9784 Py_ssize_t length, newlength = 0;
9785 int kind, outkind;
9786 void *data, *outdata;
9787 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9788
Benjamin Petersoneea48462012-01-16 14:28:50 -05009789 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009790
9791 kind = PyUnicode_KIND(self);
9792 data = PyUnicode_DATA(self);
9793 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +02009794 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -04009795 PyErr_SetString(PyExc_OverflowError, "string is too long");
9796 return NULL;
9797 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -04009798 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009799 if (tmp == NULL)
9800 return PyErr_NoMemory();
9801 newlength = perform(kind, data, length, tmp, &maxchar);
9802 res = PyUnicode_New(newlength, maxchar);
9803 if (res == NULL)
9804 goto leave;
9805 tmpend = tmp + newlength;
9806 outdata = PyUnicode_DATA(res);
9807 outkind = PyUnicode_KIND(res);
9808 switch (outkind) {
9809 case PyUnicode_1BYTE_KIND:
9810 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9811 break;
9812 case PyUnicode_2BYTE_KIND:
9813 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9814 break;
9815 case PyUnicode_4BYTE_KIND:
9816 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9817 break;
9818 default:
9819 assert(0);
9820 break;
9821 }
9822 leave:
9823 PyMem_FREE(tmp);
9824 return res;
9825}
9826
Tim Peters8ce9f162004-08-27 01:49:32 +00009827PyObject *
9828PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009829{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009830 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009831 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009832 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009833 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009834 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9835 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009836 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009837 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009838 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009839 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009840 int use_memcpy;
9841 unsigned char *res_data = NULL, *sep_data = NULL;
9842 PyObject *last_obj;
9843 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009844
Benjamin Peterson9743b2c2014-02-15 13:02:52 -05009845 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +00009846 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009847 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009848 }
9849
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009850 /* NOTE: the following code can't call back into Python code,
9851 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009852 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009853
Tim Peters05eba1f2004-08-27 21:32:02 +00009854 seqlen = PySequence_Fast_GET_SIZE(fseq);
9855 /* If empty sequence, return u"". */
9856 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009857 Py_DECREF(fseq);
Serhiy Storchaka678db842013-01-26 12:16:36 +02009858 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +00009859 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009860
Tim Peters05eba1f2004-08-27 21:32:02 +00009861 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009862 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009863 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009864 if (seqlen == 1) {
9865 if (PyUnicode_CheckExact(items[0])) {
9866 res = items[0];
9867 Py_INCREF(res);
9868 Py_DECREF(fseq);
9869 return res;
9870 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009871 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009872 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009873 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009874 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009875 /* Set up sep and seplen */
9876 if (separator == NULL) {
9877 /* fall back to a blank space separator */
9878 sep = PyUnicode_FromOrdinal(' ');
9879 if (!sep)
9880 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009881 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009882 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009883 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009884 else {
9885 if (!PyUnicode_Check(separator)) {
9886 PyErr_Format(PyExc_TypeError,
9887 "separator: expected str instance,"
9888 " %.80s found",
9889 Py_TYPE(separator)->tp_name);
9890 goto onError;
9891 }
9892 if (PyUnicode_READY(separator))
9893 goto onError;
9894 sep = separator;
9895 seplen = PyUnicode_GET_LENGTH(separator);
9896 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9897 /* inc refcount to keep this code path symmetric with the
9898 above case of a blank separator */
9899 Py_INCREF(sep);
9900 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009901 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009902 }
9903
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009904 /* There are at least two things to join, or else we have a subclass
9905 * of str in the sequence.
9906 * Do a pre-pass to figure out the total amount of space we'll
9907 * need (sz), and see whether all argument are strings.
9908 */
9909 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009910#ifdef Py_DEBUG
9911 use_memcpy = 0;
9912#else
9913 use_memcpy = 1;
9914#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009915 for (i = 0; i < seqlen; i++) {
9916 const Py_ssize_t old_sz = sz;
9917 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009918 if (!PyUnicode_Check(item)) {
9919 PyErr_Format(PyExc_TypeError,
Victor Stinnera33bce02014-07-04 22:47:46 +02009920 "sequence item %zd: expected str instance,"
Benjamin Peterson29060642009-01-31 22:14:21 +00009921 " %.80s found",
9922 i, Py_TYPE(item)->tp_name);
9923 goto onError;
9924 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009925 if (PyUnicode_READY(item) == -1)
9926 goto onError;
9927 sz += PyUnicode_GET_LENGTH(item);
9928 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -07009929 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009930 if (i != 0)
9931 sz += seplen;
9932 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9933 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009934 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009935 goto onError;
9936 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009937 if (use_memcpy && last_obj != NULL) {
9938 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9939 use_memcpy = 0;
9940 }
9941 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009942 }
Tim Petersced69f82003-09-16 20:30:58 +00009943
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009944 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009945 if (res == NULL)
9946 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009947
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009948 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009949#ifdef Py_DEBUG
9950 use_memcpy = 0;
9951#else
9952 if (use_memcpy) {
9953 res_data = PyUnicode_1BYTE_DATA(res);
9954 kind = PyUnicode_KIND(res);
9955 if (seplen != 0)
9956 sep_data = PyUnicode_1BYTE_DATA(sep);
9957 }
9958#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +02009959 if (use_memcpy) {
9960 for (i = 0; i < seqlen; ++i) {
9961 Py_ssize_t itemlen;
9962 item = items[i];
9963
9964 /* Copy item, and maybe the separator. */
9965 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009966 Py_MEMCPY(res_data,
9967 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009968 kind * seplen);
9969 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009970 }
Victor Stinner4560f9c2013-04-14 18:56:46 +02009971
9972 itemlen = PyUnicode_GET_LENGTH(item);
9973 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009974 Py_MEMCPY(res_data,
9975 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009976 kind * itemlen);
9977 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009978 }
Victor Stinner4560f9c2013-04-14 18:56:46 +02009979 }
9980 assert(res_data == PyUnicode_1BYTE_DATA(res)
9981 + kind * PyUnicode_GET_LENGTH(res));
9982 }
9983 else {
9984 for (i = 0, res_offset = 0; i < seqlen; ++i) {
9985 Py_ssize_t itemlen;
9986 item = items[i];
9987
9988 /* Copy item, and maybe the separator. */
9989 if (i && seplen != 0) {
9990 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
9991 res_offset += seplen;
9992 }
9993
9994 itemlen = PyUnicode_GET_LENGTH(item);
9995 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009996 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +02009997 res_offset += itemlen;
9998 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009999 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010000 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +020010001 }
Tim Peters8ce9f162004-08-27 01:49:32 +000010002
Tim Peters05eba1f2004-08-27 21:32:02 +000010003 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010004 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010005 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010006 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010007
Benjamin Peterson29060642009-01-31 22:14:21 +000010008 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +000010009 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010010 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010011 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010012 return NULL;
10013}
10014
/* FILL(kind, data, value, start, length): store `value` into `length`
   consecutive elements of the buffer `data`, beginning at element index
   `start`.  `kind` selects the element width (1, 2 or 4 bytes) and must
   not be PyUnicode_WCHAR_KIND.

   This is a macro: `value` and `length` may be evaluated more than once,
   so do not pass expressions with side effects.  Every argument is
   parenthesized at each use so that compound expressions expand safely. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
10038
Victor Stinnerd3f08822012-05-29 12:57:52 +020010039void
10040_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10041 Py_UCS4 fill_char)
10042{
10043 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
10044 const void *data = PyUnicode_DATA(unicode);
10045 assert(PyUnicode_IS_READY(unicode));
10046 assert(unicode_modifiable(unicode));
10047 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10048 assert(start >= 0);
10049 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
10050 FILL(kind, data, fill_char, start, length);
10051}
10052
Victor Stinner3fe55312012-01-04 00:33:50 +010010053Py_ssize_t
10054PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10055 Py_UCS4 fill_char)
10056{
10057 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +010010058
10059 if (!PyUnicode_Check(unicode)) {
10060 PyErr_BadInternalCall();
10061 return -1;
10062 }
10063 if (PyUnicode_READY(unicode) == -1)
10064 return -1;
10065 if (unicode_check_modifiable(unicode))
10066 return -1;
10067
Victor Stinnerd3f08822012-05-29 12:57:52 +020010068 if (start < 0) {
10069 PyErr_SetString(PyExc_IndexError, "string index out of range");
10070 return -1;
10071 }
Victor Stinner3fe55312012-01-04 00:33:50 +010010072 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10073 PyErr_SetString(PyExc_ValueError,
10074 "fill character is bigger than "
10075 "the string maximum character");
10076 return -1;
10077 }
10078
10079 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10080 length = Py_MIN(maxlen, length);
10081 if (length <= 0)
10082 return 0;
10083
Victor Stinnerd3f08822012-05-29 12:57:52 +020010084 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +010010085 return length;
10086}
10087
Victor Stinner9310abb2011-10-05 00:59:23 +020010088static PyObject *
10089pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010090 Py_ssize_t left,
10091 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010092 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010093{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010094 PyObject *u;
10095 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010096 int kind;
10097 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010098
10099 if (left < 0)
10100 left = 0;
10101 if (right < 0)
10102 right = 0;
10103
Victor Stinnerc4b49542011-12-11 22:44:26 +010010104 if (left == 0 && right == 0)
10105 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010106
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010107 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10108 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010109 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10110 return NULL;
10111 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010112 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010113 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010114 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010115 if (!u)
10116 return NULL;
10117
10118 kind = PyUnicode_KIND(u);
10119 data = PyUnicode_DATA(u);
10120 if (left)
10121 FILL(kind, data, fill, 0, left);
10122 if (right)
10123 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010124 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010125 assert(_PyUnicode_CheckConsistency(u, 1));
10126 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010127}
10128
Alexander Belopolsky40018472011-02-26 01:02:56 +000010129PyObject *
10130PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010131{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010132 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010133
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010134 if (ensure_unicode(string) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010135 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010136
Benjamin Petersonead6b532011-12-20 17:23:42 -060010137 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010138 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010139 if (PyUnicode_IS_ASCII(string))
10140 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010141 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010142 PyUnicode_GET_LENGTH(string), keepends);
10143 else
10144 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010145 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010146 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010147 break;
10148 case PyUnicode_2BYTE_KIND:
10149 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010150 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010151 PyUnicode_GET_LENGTH(string), keepends);
10152 break;
10153 case PyUnicode_4BYTE_KIND:
10154 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010155 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010156 PyUnicode_GET_LENGTH(string), keepends);
10157 break;
10158 default:
10159 assert(0);
10160 list = 0;
10161 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010162 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010163}
10164
Alexander Belopolsky40018472011-02-26 01:02:56 +000010165static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010166split(PyObject *self,
10167 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010168 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010169{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010170 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010171 void *buf1, *buf2;
10172 Py_ssize_t len1, len2;
10173 PyObject* out;
10174
Guido van Rossumd57fd912000-03-10 22:53:23 +000010175 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010176 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010177
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010178 if (PyUnicode_READY(self) == -1)
10179 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010180
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010181 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010182 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010183 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010184 if (PyUnicode_IS_ASCII(self))
10185 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010186 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010187 PyUnicode_GET_LENGTH(self), maxcount
10188 );
10189 else
10190 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010191 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010192 PyUnicode_GET_LENGTH(self), maxcount
10193 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010194 case PyUnicode_2BYTE_KIND:
10195 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010196 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010197 PyUnicode_GET_LENGTH(self), maxcount
10198 );
10199 case PyUnicode_4BYTE_KIND:
10200 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010201 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010202 PyUnicode_GET_LENGTH(self), maxcount
10203 );
10204 default:
10205 assert(0);
10206 return NULL;
10207 }
10208
10209 if (PyUnicode_READY(substring) == -1)
10210 return NULL;
10211
10212 kind1 = PyUnicode_KIND(self);
10213 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010214 len1 = PyUnicode_GET_LENGTH(self);
10215 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010216 if (kind1 < kind2 || len1 < len2) {
10217 out = PyList_New(1);
10218 if (out == NULL)
10219 return NULL;
10220 Py_INCREF(self);
10221 PyList_SET_ITEM(out, 0, self);
10222 return out;
10223 }
10224 buf1 = PyUnicode_DATA(self);
10225 buf2 = PyUnicode_DATA(substring);
10226 if (kind2 != kind1) {
10227 buf2 = _PyUnicode_AsKind(substring, kind1);
10228 if (!buf2)
10229 return NULL;
10230 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010231
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010232 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010233 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010234 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10235 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010236 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010237 else
10238 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010239 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010240 break;
10241 case PyUnicode_2BYTE_KIND:
10242 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010243 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010244 break;
10245 case PyUnicode_4BYTE_KIND:
10246 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010247 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010248 break;
10249 default:
10250 out = NULL;
10251 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010252 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010253 PyMem_Free(buf2);
10254 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010255}
10256
Alexander Belopolsky40018472011-02-26 01:02:56 +000010257static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010258rsplit(PyObject *self,
10259 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010260 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010261{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010262 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010263 void *buf1, *buf2;
10264 Py_ssize_t len1, len2;
10265 PyObject* out;
10266
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010267 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010268 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010269
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010270 if (PyUnicode_READY(self) == -1)
10271 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010272
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010273 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010274 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010275 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010276 if (PyUnicode_IS_ASCII(self))
10277 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010278 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010279 PyUnicode_GET_LENGTH(self), maxcount
10280 );
10281 else
10282 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010283 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010284 PyUnicode_GET_LENGTH(self), maxcount
10285 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010286 case PyUnicode_2BYTE_KIND:
10287 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010288 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010289 PyUnicode_GET_LENGTH(self), maxcount
10290 );
10291 case PyUnicode_4BYTE_KIND:
10292 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010293 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010294 PyUnicode_GET_LENGTH(self), maxcount
10295 );
10296 default:
10297 assert(0);
10298 return NULL;
10299 }
10300
10301 if (PyUnicode_READY(substring) == -1)
10302 return NULL;
10303
10304 kind1 = PyUnicode_KIND(self);
10305 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010306 len1 = PyUnicode_GET_LENGTH(self);
10307 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010308 if (kind1 < kind2 || len1 < len2) {
10309 out = PyList_New(1);
10310 if (out == NULL)
10311 return NULL;
10312 Py_INCREF(self);
10313 PyList_SET_ITEM(out, 0, self);
10314 return out;
10315 }
10316 buf1 = PyUnicode_DATA(self);
10317 buf2 = PyUnicode_DATA(substring);
10318 if (kind2 != kind1) {
10319 buf2 = _PyUnicode_AsKind(substring, kind1);
10320 if (!buf2)
10321 return NULL;
10322 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010323
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010324 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010325 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010326 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10327 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010328 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010329 else
10330 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010331 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010332 break;
10333 case PyUnicode_2BYTE_KIND:
10334 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010335 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010336 break;
10337 case PyUnicode_4BYTE_KIND:
10338 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010339 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010340 break;
10341 default:
10342 out = NULL;
10343 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010344 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010345 PyMem_Free(buf2);
10346 return out;
10347}
10348
10349static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010350anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10351 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010352{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010353 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010354 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010355 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10356 return asciilib_find(buf1, len1, buf2, len2, offset);
10357 else
10358 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010359 case PyUnicode_2BYTE_KIND:
10360 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10361 case PyUnicode_4BYTE_KIND:
10362 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10363 }
10364 assert(0);
10365 return -1;
10366}
10367
10368static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010369anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10370 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010371{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010372 switch (kind) {
10373 case PyUnicode_1BYTE_KIND:
10374 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10375 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10376 else
10377 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10378 case PyUnicode_2BYTE_KIND:
10379 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10380 case PyUnicode_4BYTE_KIND:
10381 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10382 }
10383 assert(0);
10384 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010385}
10386
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010387static void
10388replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10389 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10390{
10391 int kind = PyUnicode_KIND(u);
10392 void *data = PyUnicode_DATA(u);
10393 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10394 if (kind == PyUnicode_1BYTE_KIND) {
10395 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10396 (Py_UCS1 *)data + len,
10397 u1, u2, maxcount);
10398 }
10399 else if (kind == PyUnicode_2BYTE_KIND) {
10400 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10401 (Py_UCS2 *)data + len,
10402 u1, u2, maxcount);
10403 }
10404 else {
10405 assert(kind == PyUnicode_4BYTE_KIND);
10406 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10407 (Py_UCS4 *)data + len,
10408 u1, u2, maxcount);
10409 }
10410}
10411
Alexander Belopolsky40018472011-02-26 01:02:56 +000010412static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010413replace(PyObject *self, PyObject *str1,
10414 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010415{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010416 PyObject *u;
10417 char *sbuf = PyUnicode_DATA(self);
10418 char *buf1 = PyUnicode_DATA(str1);
10419 char *buf2 = PyUnicode_DATA(str2);
10420 int srelease = 0, release1 = 0, release2 = 0;
10421 int skind = PyUnicode_KIND(self);
10422 int kind1 = PyUnicode_KIND(str1);
10423 int kind2 = PyUnicode_KIND(str2);
10424 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10425 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10426 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010427 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010428 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010429
10430 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010431 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010432 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010433 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010434
Victor Stinner59de0ee2011-10-07 10:01:28 +020010435 if (str1 == str2)
10436 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010437
Victor Stinner49a0a212011-10-12 23:46:10 +020010438 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010439 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10440 if (maxchar < maxchar_str1)
10441 /* substring too wide to be present */
10442 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010443 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10444 /* Replacing str1 with str2 may cause a maxchar reduction in the
10445 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010446 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010447 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010448
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010449 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010450 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010451 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010452 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010453 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010454 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010455 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010456 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010457
Victor Stinner69ed0f42013-04-09 21:48:24 +020010458 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010459 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010460 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010461 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010462 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010463 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010464 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010465 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010466
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010467 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10468 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010469 }
10470 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010471 int rkind = skind;
10472 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010473 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010474
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010475 if (kind1 < rkind) {
10476 /* widen substring */
10477 buf1 = _PyUnicode_AsKind(str1, rkind);
10478 if (!buf1) goto error;
10479 release1 = 1;
10480 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010481 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010482 if (i < 0)
10483 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010484 if (rkind > kind2) {
10485 /* widen replacement */
10486 buf2 = _PyUnicode_AsKind(str2, rkind);
10487 if (!buf2) goto error;
10488 release2 = 1;
10489 }
10490 else if (rkind < kind2) {
10491 /* widen self and buf1 */
10492 rkind = kind2;
10493 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010494 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010495 sbuf = _PyUnicode_AsKind(self, rkind);
10496 if (!sbuf) goto error;
10497 srelease = 1;
10498 buf1 = _PyUnicode_AsKind(str1, rkind);
10499 if (!buf1) goto error;
10500 release1 = 1;
10501 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010502 u = PyUnicode_New(slen, maxchar);
10503 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010504 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010505 assert(PyUnicode_KIND(u) == rkind);
10506 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010507
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010508 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010509 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010510 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010511 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010512 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010513 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010514
10515 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010516 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010517 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010518 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010519 if (i == -1)
10520 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010521 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010522 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010523 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010524 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010525 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010526 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010527 }
10528 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010529 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010530 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010531 int rkind = skind;
10532 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010533
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010534 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010535 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010536 buf1 = _PyUnicode_AsKind(str1, rkind);
10537 if (!buf1) goto error;
10538 release1 = 1;
10539 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010540 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010541 if (n == 0)
10542 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010543 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010544 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010545 buf2 = _PyUnicode_AsKind(str2, rkind);
10546 if (!buf2) goto error;
10547 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010548 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010549 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010550 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010551 rkind = kind2;
10552 sbuf = _PyUnicode_AsKind(self, rkind);
10553 if (!sbuf) goto error;
10554 srelease = 1;
10555 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010556 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010557 buf1 = _PyUnicode_AsKind(str1, rkind);
10558 if (!buf1) goto error;
10559 release1 = 1;
10560 }
10561 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10562 PyUnicode_GET_LENGTH(str1))); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010563 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010564 PyErr_SetString(PyExc_OverflowError,
10565 "replace string is too long");
10566 goto error;
10567 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010568 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010569 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010570 _Py_INCREF_UNICODE_EMPTY();
10571 if (!unicode_empty)
10572 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010573 u = unicode_empty;
10574 goto done;
10575 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010576 if (new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010577 PyErr_SetString(PyExc_OverflowError,
10578 "replace string is too long");
10579 goto error;
10580 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010581 u = PyUnicode_New(new_size, maxchar);
10582 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010583 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010584 assert(PyUnicode_KIND(u) == rkind);
10585 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010586 ires = i = 0;
10587 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010588 while (n-- > 0) {
10589 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010590 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010591 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010592 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010593 if (j == -1)
10594 break;
10595 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010596 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010597 memcpy(res + rkind * ires,
10598 sbuf + rkind * i,
10599 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010600 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010601 }
10602 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010603 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010604 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010605 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010606 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010607 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010608 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010609 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010610 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010611 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010612 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010613 memcpy(res + rkind * ires,
10614 sbuf + rkind * i,
10615 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010616 }
10617 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010618 /* interleave */
10619 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010620 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010621 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010622 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010623 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010624 if (--n <= 0)
10625 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010626 memcpy(res + rkind * ires,
10627 sbuf + rkind * i,
10628 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010629 ires++;
10630 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010631 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010632 memcpy(res + rkind * ires,
10633 sbuf + rkind * i,
10634 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010635 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010636 }
10637
10638 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010639 unicode_adjust_maxchar(&u);
10640 if (u == NULL)
10641 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010642 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010643
10644 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010645 if (srelease)
10646 PyMem_FREE(sbuf);
10647 if (release1)
10648 PyMem_FREE(buf1);
10649 if (release2)
10650 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010651 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010652 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010653
Benjamin Peterson29060642009-01-31 22:14:21 +000010654 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010655 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010656 if (srelease)
10657 PyMem_FREE(sbuf);
10658 if (release1)
10659 PyMem_FREE(buf1);
10660 if (release2)
10661 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010662 return unicode_result_unchanged(self);
10663
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010664 error:
10665 if (srelease && sbuf)
10666 PyMem_FREE(sbuf);
10667 if (release1 && buf1)
10668 PyMem_FREE(buf1);
10669 if (release2 && buf2)
10670 PyMem_FREE(buf2);
10671 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010672}
10673
10674/* --- Unicode Object Methods --------------------------------------------- */
10675
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010676PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010677 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010678\n\
10679Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010680characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010681
10682static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010683unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010684{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010685 if (PyUnicode_READY(self) == -1)
10686 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010687 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010688}
10689
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010690PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010691 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010692\n\
10693Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010694have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010695
10696static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010697unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010698{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010699 if (PyUnicode_READY(self) == -1)
10700 return NULL;
10701 if (PyUnicode_GET_LENGTH(self) == 0)
10702 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010703 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010704}
10705
Benjamin Petersond5890c82012-01-14 13:23:30 -050010706PyDoc_STRVAR(casefold__doc__,
10707 "S.casefold() -> str\n\
10708\n\
10709Return a version of S suitable for caseless comparisons.");
10710
10711static PyObject *
10712unicode_casefold(PyObject *self)
10713{
10714 if (PyUnicode_READY(self) == -1)
10715 return NULL;
10716 if (PyUnicode_IS_ASCII(self))
10717 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010718 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010719}
10720
10721
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010722/* Argument converter. Accepts a single Unicode character. */
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010723
10724static int
10725convert_uc(PyObject *obj, void *addr)
10726{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010727 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010728
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010729 if (!PyUnicode_Check(obj)) {
10730 PyErr_Format(PyExc_TypeError,
10731 "The fill character must be a unicode character, "
10732 "not %.100s", Py_TYPE(obj)->tp_name);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010733 return 0;
10734 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010735 if (PyUnicode_READY(obj) < 0)
10736 return 0;
10737 if (PyUnicode_GET_LENGTH(obj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010738 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010739 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010740 return 0;
10741 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010742 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010743 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010744}
10745
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010746PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010747 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010748\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010749Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010750done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010751
10752static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010753unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010754{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010755 Py_ssize_t marg, left;
10756 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010757 Py_UCS4 fillchar = ' ';
10758
Victor Stinnere9a29352011-10-01 02:14:59 +020010759 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010760 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010761
Benjamin Petersonbac79492012-01-14 13:34:47 -050010762 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010763 return NULL;
10764
Victor Stinnerc4b49542011-12-11 22:44:26 +010010765 if (PyUnicode_GET_LENGTH(self) >= width)
10766 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010767
Victor Stinnerc4b49542011-12-11 22:44:26 +010010768 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010769 left = marg / 2 + (marg & width & 1);
10770
Victor Stinner9310abb2011-10-05 00:59:23 +020010771 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010772}
10773
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010774/* This function assumes that str1 and str2 are readied by the caller. */
10775
Marc-André Lemburge5034372000-08-08 08:04:29 +000010776static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010777unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010778{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010779#define COMPARE(TYPE1, TYPE2) \
10780 do { \
10781 TYPE1* p1 = (TYPE1 *)data1; \
10782 TYPE2* p2 = (TYPE2 *)data2; \
10783 TYPE1* end = p1 + len; \
10784 Py_UCS4 c1, c2; \
10785 for (; p1 != end; p1++, p2++) { \
10786 c1 = *p1; \
10787 c2 = *p2; \
10788 if (c1 != c2) \
10789 return (c1 < c2) ? -1 : 1; \
10790 } \
10791 } \
10792 while (0)
10793
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010794 int kind1, kind2;
10795 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010796 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010797
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010798 kind1 = PyUnicode_KIND(str1);
10799 kind2 = PyUnicode_KIND(str2);
10800 data1 = PyUnicode_DATA(str1);
10801 data2 = PyUnicode_DATA(str2);
10802 len1 = PyUnicode_GET_LENGTH(str1);
10803 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010804 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010805
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010806 switch(kind1) {
10807 case PyUnicode_1BYTE_KIND:
10808 {
10809 switch(kind2) {
10810 case PyUnicode_1BYTE_KIND:
10811 {
10812 int cmp = memcmp(data1, data2, len);
10813 /* normalize result of memcmp() into the range [-1; 1] */
10814 if (cmp < 0)
10815 return -1;
10816 if (cmp > 0)
10817 return 1;
10818 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020010819 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010820 case PyUnicode_2BYTE_KIND:
10821 COMPARE(Py_UCS1, Py_UCS2);
10822 break;
10823 case PyUnicode_4BYTE_KIND:
10824 COMPARE(Py_UCS1, Py_UCS4);
10825 break;
10826 default:
10827 assert(0);
10828 }
10829 break;
10830 }
10831 case PyUnicode_2BYTE_KIND:
10832 {
10833 switch(kind2) {
10834 case PyUnicode_1BYTE_KIND:
10835 COMPARE(Py_UCS2, Py_UCS1);
10836 break;
10837 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010838 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010839 COMPARE(Py_UCS2, Py_UCS2);
10840 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010841 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010842 case PyUnicode_4BYTE_KIND:
10843 COMPARE(Py_UCS2, Py_UCS4);
10844 break;
10845 default:
10846 assert(0);
10847 }
10848 break;
10849 }
10850 case PyUnicode_4BYTE_KIND:
10851 {
10852 switch(kind2) {
10853 case PyUnicode_1BYTE_KIND:
10854 COMPARE(Py_UCS4, Py_UCS1);
10855 break;
10856 case PyUnicode_2BYTE_KIND:
10857 COMPARE(Py_UCS4, Py_UCS2);
10858 break;
10859 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010860 {
10861#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10862 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10863 /* normalize result of wmemcmp() into the range [-1; 1] */
10864 if (cmp < 0)
10865 return -1;
10866 if (cmp > 0)
10867 return 1;
10868#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010869 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020010870#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010871 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010872 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010873 default:
10874 assert(0);
10875 }
10876 break;
10877 }
10878 default:
10879 assert(0);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010880 }
10881
Victor Stinner770e19e2012-10-04 22:59:45 +020010882 if (len1 == len2)
10883 return 0;
10884 if (len1 < len2)
10885 return -1;
10886 else
10887 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010888
10889#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000010890}
10891
Victor Stinnerc8bc5372013-11-04 11:08:10 +010010892Py_LOCAL(int)
Victor Stinnere5567ad2012-10-23 02:48:49 +020010893unicode_compare_eq(PyObject *str1, PyObject *str2)
10894{
10895 int kind;
10896 void *data1, *data2;
10897 Py_ssize_t len;
10898 int cmp;
10899
Victor Stinnere5567ad2012-10-23 02:48:49 +020010900 len = PyUnicode_GET_LENGTH(str1);
10901 if (PyUnicode_GET_LENGTH(str2) != len)
10902 return 0;
10903 kind = PyUnicode_KIND(str1);
10904 if (PyUnicode_KIND(str2) != kind)
10905 return 0;
10906 data1 = PyUnicode_DATA(str1);
10907 data2 = PyUnicode_DATA(str2);
10908
10909 cmp = memcmp(data1, data2, len * kind);
10910 return (cmp == 0);
10911}
10912
10913
Alexander Belopolsky40018472011-02-26 01:02:56 +000010914int
10915PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010916{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010917 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10918 if (PyUnicode_READY(left) == -1 ||
10919 PyUnicode_READY(right) == -1)
10920 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010010921
10922 /* a string is equal to itself */
10923 if (left == right)
10924 return 0;
10925
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010926 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010927 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010928 PyErr_Format(PyExc_TypeError,
10929 "Can't compare %.100s and %.100s",
10930 left->ob_type->tp_name,
10931 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010932 return -1;
10933}
10934
Martin v. Löwis5b222132007-06-10 09:51:05 +000010935int
Victor Stinnerad14ccd2013-11-07 00:46:04 +010010936_PyUnicode_CompareWithId(PyObject *left, _Py_Identifier *right)
10937{
10938 PyObject *right_str = _PyUnicode_FromId(right); /* borrowed */
10939 if (right_str == NULL)
10940 return -1;
10941 return PyUnicode_Compare(left, right_str);
10942}
10943
10944int
Martin v. Löwis5b222132007-06-10 09:51:05 +000010945PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10946{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010947 Py_ssize_t i;
10948 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010949 Py_UCS4 chr;
10950
Victor Stinner910337b2011-10-03 03:20:16 +020010951 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010952 if (PyUnicode_READY(uni) == -1)
10953 return -1;
10954 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010010955 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010010956 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010010957 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010010958 size_t len, len2 = strlen(str);
10959 int cmp;
10960
10961 len = Py_MIN(len1, len2);
10962 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010010963 if (cmp != 0) {
10964 if (cmp < 0)
10965 return -1;
10966 else
10967 return 1;
10968 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010010969 if (len1 > len2)
10970 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010971 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010010972 return -1; /* str is longer */
10973 return 0;
10974 }
10975 else {
10976 void *data = PyUnicode_DATA(uni);
10977 /* Compare Unicode string and source character set string */
10978 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020010979 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010010980 return (chr < (unsigned char)(str[i])) ? -1 : 1;
10981 /* This check keeps Python strings that end in '\0' from comparing equal
10982 to C strings identical up to that point. */
10983 if (PyUnicode_GET_LENGTH(uni) != i || chr)
10984 return 1; /* uni is longer */
10985 if (str[i])
10986 return -1; /* str is longer */
10987 return 0;
10988 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000010989}
10990
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010991
Benjamin Peterson29060642009-01-31 22:14:21 +000010992#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010993 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010994
Alexander Belopolsky40018472011-02-26 01:02:56 +000010995PyObject *
10996PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010997{
10998 int result;
Victor Stinnere5567ad2012-10-23 02:48:49 +020010999 PyObject *v;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011000
Victor Stinnere5567ad2012-10-23 02:48:49 +020011001 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11002 Py_RETURN_NOTIMPLEMENTED;
11003
11004 if (PyUnicode_READY(left) == -1 ||
11005 PyUnicode_READY(right) == -1)
11006 return NULL;
11007
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011008 if (left == right) {
11009 switch (op) {
11010 case Py_EQ:
11011 case Py_LE:
11012 case Py_GE:
11013 /* a string is equal to itself */
11014 v = Py_True;
11015 break;
11016 case Py_NE:
11017 case Py_LT:
11018 case Py_GT:
11019 v = Py_False;
11020 break;
11021 default:
11022 PyErr_BadArgument();
11023 return NULL;
11024 }
11025 }
11026 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020011027 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010011028 result ^= (op == Py_NE);
11029 v = TEST_COND(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020011030 }
11031 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020011032 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011033
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011034 /* Convert the return value to a Boolean */
11035 switch (op) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011036 case Py_LE:
11037 v = TEST_COND(result <= 0);
11038 break;
11039 case Py_GE:
11040 v = TEST_COND(result >= 0);
11041 break;
11042 case Py_LT:
11043 v = TEST_COND(result == -1);
11044 break;
11045 case Py_GT:
11046 v = TEST_COND(result == 1);
11047 break;
11048 default:
11049 PyErr_BadArgument();
11050 return NULL;
11051 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011052 }
Victor Stinnere5567ad2012-10-23 02:48:49 +020011053 Py_INCREF(v);
11054 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011055}
11056
Alexander Belopolsky40018472011-02-26 01:02:56 +000011057int
Raymond Hettingerac2ef652015-07-04 16:04:44 -070011058_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11059{
11060 return unicode_eq(aa, bb);
11061}
11062
11063int
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011064PyUnicode_Contains(PyObject *str, PyObject *substr)
Guido van Rossum403d68b2000-03-13 15:55:09 +000011065{
Victor Stinner77282cb2013-04-14 19:22:47 +020011066 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011067 void *buf1, *buf2;
11068 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011069 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011070
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011071 if (!PyUnicode_Check(substr)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011072 PyErr_Format(PyExc_TypeError,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011073 "'in <string>' requires string as left operand, not %.100s",
11074 Py_TYPE(substr)->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011075 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011076 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011077 if (PyUnicode_READY(substr) == -1)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011078 return -1;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011079 if (ensure_unicode(str) < 0)
11080 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011081
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011082 kind1 = PyUnicode_KIND(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011083 kind2 = PyUnicode_KIND(substr);
11084 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011085 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011086 len1 = PyUnicode_GET_LENGTH(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011087 len2 = PyUnicode_GET_LENGTH(substr);
11088 if (len1 < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011089 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011090 buf1 = PyUnicode_DATA(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011091 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011092 if (len2 == 1) {
11093 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11094 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011095 return result;
11096 }
11097 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011098 buf2 = _PyUnicode_AsKind(substr, kind1);
11099 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011100 return -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011101 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011102
Victor Stinner77282cb2013-04-14 19:22:47 +020011103 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011104 case PyUnicode_1BYTE_KIND:
11105 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11106 break;
11107 case PyUnicode_2BYTE_KIND:
11108 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11109 break;
11110 case PyUnicode_4BYTE_KIND:
11111 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11112 break;
11113 default:
11114 result = -1;
11115 assert(0);
11116 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011117
Victor Stinner77282cb2013-04-14 19:22:47 +020011118 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011119 PyMem_Free(buf2);
11120
Guido van Rossum403d68b2000-03-13 15:55:09 +000011121 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011122}
11123
Guido van Rossumd57fd912000-03-10 22:53:23 +000011124/* Concat to string or Unicode object giving a new Unicode object. */
11125
Alexander Belopolsky40018472011-02-26 01:02:56 +000011126PyObject *
11127PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011128{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011129 PyObject *result;
Victor Stinner127226b2011-10-13 01:12:34 +020011130 Py_UCS4 maxchar, maxchar2;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011131 Py_ssize_t left_len, right_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011132
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011133 if (ensure_unicode(left) < 0 || ensure_unicode(right) < 0)
11134 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011135
11136 /* Shortcuts */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011137 if (left == unicode_empty)
11138 return PyUnicode_FromObject(right);
11139 if (right == unicode_empty)
11140 return PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011141
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011142 left_len = PyUnicode_GET_LENGTH(left);
11143 right_len = PyUnicode_GET_LENGTH(right);
11144 if (left_len > PY_SSIZE_T_MAX - right_len) {
Victor Stinner488fa492011-12-12 00:01:39 +010011145 PyErr_SetString(PyExc_OverflowError,
11146 "strings are too large to concat");
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011147 return NULL;
Victor Stinner488fa492011-12-12 00:01:39 +010011148 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011149 new_len = left_len + right_len;
Victor Stinner488fa492011-12-12 00:01:39 +010011150
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011151 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11152 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011153 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011154
Guido van Rossumd57fd912000-03-10 22:53:23 +000011155 /* Concat the two Unicode strings */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011156 result = PyUnicode_New(new_len, maxchar);
11157 if (result == NULL)
11158 return NULL;
11159 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11160 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11161 assert(_PyUnicode_CheckConsistency(result, 1));
11162 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011163}
11164
Walter Dörwald1ab83302007-05-18 17:15:44 +000011165void
Victor Stinner23e56682011-10-03 03:54:37 +020011166PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011167{
Victor Stinner23e56682011-10-03 03:54:37 +020011168 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011169 Py_UCS4 maxchar, maxchar2;
11170 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011171
11172 if (p_left == NULL) {
11173 if (!PyErr_Occurred())
11174 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011175 return;
11176 }
Victor Stinner23e56682011-10-03 03:54:37 +020011177 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011178 if (right == NULL || left == NULL
11179 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011180 if (!PyErr_Occurred())
11181 PyErr_BadInternalCall();
11182 goto error;
11183 }
11184
Benjamin Petersonbac79492012-01-14 13:34:47 -050011185 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011186 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011187 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011188 goto error;
11189
Victor Stinner488fa492011-12-12 00:01:39 +010011190 /* Shortcuts */
11191 if (left == unicode_empty) {
11192 Py_DECREF(left);
11193 Py_INCREF(right);
11194 *p_left = right;
11195 return;
11196 }
11197 if (right == unicode_empty)
11198 return;
11199
11200 left_len = PyUnicode_GET_LENGTH(left);
11201 right_len = PyUnicode_GET_LENGTH(right);
11202 if (left_len > PY_SSIZE_T_MAX - right_len) {
11203 PyErr_SetString(PyExc_OverflowError,
11204 "strings are too large to concat");
11205 goto error;
11206 }
11207 new_len = left_len + right_len;
11208
11209 if (unicode_modifiable(left)
11210 && PyUnicode_CheckExact(right)
11211 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011212 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11213 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011214 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011215 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011216 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11217 {
11218 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011219 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011220 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011221
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011222 /* copy 'right' into the newly allocated area of 'left' */
11223 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011224 }
Victor Stinner488fa492011-12-12 00:01:39 +010011225 else {
11226 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11227 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011228 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011229
Victor Stinner488fa492011-12-12 00:01:39 +010011230 /* Concat the two Unicode strings */
11231 res = PyUnicode_New(new_len, maxchar);
11232 if (res == NULL)
11233 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011234 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11235 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011236 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011237 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011238 }
11239 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011240 return;
11241
11242error:
Victor Stinner488fa492011-12-12 00:01:39 +010011243 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011244}
11245
11246void
11247PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11248{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011249 PyUnicode_Append(pleft, right);
11250 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011251}
11252
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011253/*
11254Wraps stringlib_parse_args_finds() and additionally ensures that the
11255first argument is a unicode object.
11256*/
11257
11258Py_LOCAL_INLINE(int)
11259parse_args_finds_unicode(const char * function_name, PyObject *args,
11260 PyObject **substring,
11261 Py_ssize_t *start, Py_ssize_t *end)
11262{
11263 if(stringlib_parse_args_finds(function_name, args, substring,
11264 start, end)) {
11265 if (ensure_unicode(*substring) < 0)
11266 return 0;
11267 return 1;
11268 }
11269 return 0;
11270}
11271
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011272PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011273 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011274\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011275Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011276string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011277interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011278
11279static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011280unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011281{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011282 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011283 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011284 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011285 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011286 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011287 void *buf1, *buf2;
11288 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011289
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011290 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011291 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011292
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011293 kind1 = PyUnicode_KIND(self);
11294 kind2 = PyUnicode_KIND(substring);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011295 if (kind1 < kind2)
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011296 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011297
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011298 len1 = PyUnicode_GET_LENGTH(self);
11299 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011300 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011301 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011302 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011303
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011304 buf1 = PyUnicode_DATA(self);
11305 buf2 = PyUnicode_DATA(substring);
11306 if (kind2 != kind1) {
11307 buf2 = _PyUnicode_AsKind(substring, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011308 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011309 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011310 }
11311 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011312 case PyUnicode_1BYTE_KIND:
11313 iresult = ucs1lib_count(
11314 ((Py_UCS1*)buf1) + start, end - start,
11315 buf2, len2, PY_SSIZE_T_MAX
11316 );
11317 break;
11318 case PyUnicode_2BYTE_KIND:
11319 iresult = ucs2lib_count(
11320 ((Py_UCS2*)buf1) + start, end - start,
11321 buf2, len2, PY_SSIZE_T_MAX
11322 );
11323 break;
11324 case PyUnicode_4BYTE_KIND:
11325 iresult = ucs4lib_count(
11326 ((Py_UCS4*)buf1) + start, end - start,
11327 buf2, len2, PY_SSIZE_T_MAX
11328 );
11329 break;
11330 default:
11331 assert(0); iresult = 0;
11332 }
11333
11334 result = PyLong_FromSsize_t(iresult);
11335
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011336 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011337 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011338
Guido van Rossumd57fd912000-03-10 22:53:23 +000011339 return result;
11340}
11341
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011342PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000011343 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011344\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000011345Encode S using the codec registered for encoding. Default encoding\n\
11346is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000011347handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000011348a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
11349'xmlcharrefreplace' as well as any other name registered with\n\
11350codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011351
11352static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011353unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011354{
Benjamin Peterson308d6372009-09-18 21:42:35 +000011355 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011356 char *encoding = NULL;
11357 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000011358
Benjamin Peterson308d6372009-09-18 21:42:35 +000011359 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
11360 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011361 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011362 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011363}
11364
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011365PyDoc_STRVAR(expandtabs__doc__,
Ezio Melotti745d54d2013-11-16 19:10:57 +020011366 "S.expandtabs(tabsize=8) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011367\n\
11368Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011369If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011370
11371static PyObject*
Ezio Melotti745d54d2013-11-16 19:10:57 +020011372unicode_expandtabs(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011373{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011374 Py_ssize_t i, j, line_pos, src_len, incr;
11375 Py_UCS4 ch;
11376 PyObject *u;
11377 void *src_data, *dest_data;
Ezio Melotti745d54d2013-11-16 19:10:57 +020011378 static char *kwlist[] = {"tabsize", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011379 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011380 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011381 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011382
Ezio Melotti745d54d2013-11-16 19:10:57 +020011383 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:expandtabs",
11384 kwlist, &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000011385 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011386
Antoine Pitrou22425222011-10-04 19:10:51 +020011387 if (PyUnicode_READY(self) == -1)
11388 return NULL;
11389
Thomas Wouters7e474022000-07-16 12:04:32 +000011390 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011391 src_len = PyUnicode_GET_LENGTH(self);
11392 i = j = line_pos = 0;
11393 kind = PyUnicode_KIND(self);
11394 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011395 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011396 for (; i < src_len; i++) {
11397 ch = PyUnicode_READ(kind, src_data, i);
11398 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011399 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011400 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011401 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011402 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011403 goto overflow;
11404 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011405 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011406 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011407 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011408 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011409 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011410 goto overflow;
11411 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011412 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011413 if (ch == '\n' || ch == '\r')
11414 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011415 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011416 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011417 if (!found)
11418 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011419
Guido van Rossumd57fd912000-03-10 22:53:23 +000011420 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011421 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011422 if (!u)
11423 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011424 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011425
Antoine Pitroue71d5742011-10-04 15:55:09 +020011426 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011427
Antoine Pitroue71d5742011-10-04 15:55:09 +020011428 for (; i < src_len; i++) {
11429 ch = PyUnicode_READ(kind, src_data, i);
11430 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011431 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011432 incr = tabsize - (line_pos % tabsize);
11433 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010011434 FILL(kind, dest_data, ' ', j, incr);
11435 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011436 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011437 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011438 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011439 line_pos++;
11440 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011441 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011442 if (ch == '\n' || ch == '\r')
11443 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011444 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011445 }
11446 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011447 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011448
Antoine Pitroue71d5742011-10-04 15:55:09 +020011449 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011450 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11451 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011452}
11453
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011454PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011455 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011456\n\
11457Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011458such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011459arguments start and end are interpreted as in slice notation.\n\
11460\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011461Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011462
11463static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011464unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011465{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011466 /* initialize variables to prevent gcc warning */
11467 PyObject *substring = NULL;
11468 Py_ssize_t start = 0;
11469 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011470 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011471
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011472 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011473 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011474
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011475 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011476 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011477
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011478 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011479
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011480 if (result == -2)
11481 return NULL;
11482
Christian Heimes217cfd12007-12-02 14:31:20 +000011483 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011484}
11485
11486static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011487unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011488{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011489 void *data;
11490 enum PyUnicode_Kind kind;
11491 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011492
11493 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
11494 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011495 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011496 }
11497 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11498 PyErr_SetString(PyExc_IndexError, "string index out of range");
11499 return NULL;
11500 }
11501 kind = PyUnicode_KIND(self);
11502 data = PyUnicode_DATA(self);
11503 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011504 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011505}
11506
Guido van Rossumc2504932007-09-18 19:42:40 +000011507/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011508 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011509static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011510unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011511{
Guido van Rossumc2504932007-09-18 19:42:40 +000011512 Py_ssize_t len;
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011513 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011514
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011515#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011516 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011517#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011518 if (_PyUnicode_HASH(self) != -1)
11519 return _PyUnicode_HASH(self);
11520 if (PyUnicode_READY(self) == -1)
11521 return -1;
11522 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011523 /*
11524 We make the hash of the empty string be 0, rather than using
11525 (prefix ^ suffix), since this slightly obfuscates the hash secret
11526 */
11527 if (len == 0) {
11528 _PyUnicode_HASH(self) = 0;
11529 return 0;
11530 }
Christian Heimes985ecdc2013-11-20 11:46:18 +010011531 x = _Py_HashBytes(PyUnicode_DATA(self),
11532 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011533 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011534 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011535}
11536
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011537PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011538 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011539\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011540Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011541
11542static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011543unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011544{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011545 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011546 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011547 PyObject *substring = NULL;
11548 Py_ssize_t start = 0;
11549 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011550
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011551 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011552 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011553
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011554 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011555 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011556
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011557 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011558
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011559 if (result == -2)
11560 return NULL;
11561
Guido van Rossumd57fd912000-03-10 22:53:23 +000011562 if (result < 0) {
11563 PyErr_SetString(PyExc_ValueError, "substring not found");
11564 return NULL;
11565 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011566
Christian Heimes217cfd12007-12-02 14:31:20 +000011567 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011568}
11569
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011570PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011571 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011572\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011573Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011574at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011575
11576static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011577unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011578{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011579 Py_ssize_t i, length;
11580 int kind;
11581 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011582 int cased;
11583
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011584 if (PyUnicode_READY(self) == -1)
11585 return NULL;
11586 length = PyUnicode_GET_LENGTH(self);
11587 kind = PyUnicode_KIND(self);
11588 data = PyUnicode_DATA(self);
11589
Guido van Rossumd57fd912000-03-10 22:53:23 +000011590 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011591 if (length == 1)
11592 return PyBool_FromLong(
11593 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011594
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011595 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011596 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011597 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011598
Guido van Rossumd57fd912000-03-10 22:53:23 +000011599 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011600 for (i = 0; i < length; i++) {
11601 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011602
Benjamin Peterson29060642009-01-31 22:14:21 +000011603 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11604 return PyBool_FromLong(0);
11605 else if (!cased && Py_UNICODE_ISLOWER(ch))
11606 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011607 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011608 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011609}
11610
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011611PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011612 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011613\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011614Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011615at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011616
11617static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011618unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011619{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011620 Py_ssize_t i, length;
11621 int kind;
11622 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011623 int cased;
11624
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011625 if (PyUnicode_READY(self) == -1)
11626 return NULL;
11627 length = PyUnicode_GET_LENGTH(self);
11628 kind = PyUnicode_KIND(self);
11629 data = PyUnicode_DATA(self);
11630
Guido van Rossumd57fd912000-03-10 22:53:23 +000011631 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011632 if (length == 1)
11633 return PyBool_FromLong(
11634 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011635
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011636 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011637 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011638 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011639
Guido van Rossumd57fd912000-03-10 22:53:23 +000011640 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011641 for (i = 0; i < length; i++) {
11642 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011643
Benjamin Peterson29060642009-01-31 22:14:21 +000011644 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11645 return PyBool_FromLong(0);
11646 else if (!cased && Py_UNICODE_ISUPPER(ch))
11647 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011648 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011649 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011650}
11651
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011652PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011653 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011654\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011655Return True if S is a titlecased string and there is at least one\n\
11656character in S, i.e. upper- and titlecase characters may only\n\
11657follow uncased characters and lowercase characters only cased ones.\n\
11658Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011659
11660static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011661unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011662{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011663 Py_ssize_t i, length;
11664 int kind;
11665 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011666 int cased, previous_is_cased;
11667
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011668 if (PyUnicode_READY(self) == -1)
11669 return NULL;
11670 length = PyUnicode_GET_LENGTH(self);
11671 kind = PyUnicode_KIND(self);
11672 data = PyUnicode_DATA(self);
11673
Guido van Rossumd57fd912000-03-10 22:53:23 +000011674 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011675 if (length == 1) {
11676 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11677 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11678 (Py_UNICODE_ISUPPER(ch) != 0));
11679 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011680
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011681 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011682 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011683 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011684
Guido van Rossumd57fd912000-03-10 22:53:23 +000011685 cased = 0;
11686 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011687 for (i = 0; i < length; i++) {
11688 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011689
Benjamin Peterson29060642009-01-31 22:14:21 +000011690 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11691 if (previous_is_cased)
11692 return PyBool_FromLong(0);
11693 previous_is_cased = 1;
11694 cased = 1;
11695 }
11696 else if (Py_UNICODE_ISLOWER(ch)) {
11697 if (!previous_is_cased)
11698 return PyBool_FromLong(0);
11699 previous_is_cased = 1;
11700 cased = 1;
11701 }
11702 else
11703 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011704 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011705 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011706}
11707
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011708PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011709 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011710\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011711Return True if all characters in S are whitespace\n\
11712and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011713
11714static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011715unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011716{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011717 Py_ssize_t i, length;
11718 int kind;
11719 void *data;
11720
11721 if (PyUnicode_READY(self) == -1)
11722 return NULL;
11723 length = PyUnicode_GET_LENGTH(self);
11724 kind = PyUnicode_KIND(self);
11725 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011726
Guido van Rossumd57fd912000-03-10 22:53:23 +000011727 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011728 if (length == 1)
11729 return PyBool_FromLong(
11730 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011731
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011732 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011733 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011734 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011735
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011736 for (i = 0; i < length; i++) {
11737 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011738 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011739 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011740 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011741 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011742}
11743
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011744PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011745 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011746\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011747Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011748and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011749
11750static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011751unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011752{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011753 Py_ssize_t i, length;
11754 int kind;
11755 void *data;
11756
11757 if (PyUnicode_READY(self) == -1)
11758 return NULL;
11759 length = PyUnicode_GET_LENGTH(self);
11760 kind = PyUnicode_KIND(self);
11761 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011762
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011763 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011764 if (length == 1)
11765 return PyBool_FromLong(
11766 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011767
11768 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011769 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011770 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011771
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011772 for (i = 0; i < length; i++) {
11773 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011774 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011775 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011776 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011777}
11778
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011779PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011780 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011781\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011782Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011783and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011784
11785static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011786unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011787{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011788 int kind;
11789 void *data;
11790 Py_ssize_t len, i;
11791
11792 if (PyUnicode_READY(self) == -1)
11793 return NULL;
11794
11795 kind = PyUnicode_KIND(self);
11796 data = PyUnicode_DATA(self);
11797 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011798
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011799 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011800 if (len == 1) {
11801 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11802 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11803 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011804
11805 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011806 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011807 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011808
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011809 for (i = 0; i < len; i++) {
11810 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011811 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011812 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011813 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011814 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011815}
11816
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011817PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011818 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011819\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011820Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011821False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011822
11823static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011824unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011825{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011826 Py_ssize_t i, length;
11827 int kind;
11828 void *data;
11829
11830 if (PyUnicode_READY(self) == -1)
11831 return NULL;
11832 length = PyUnicode_GET_LENGTH(self);
11833 kind = PyUnicode_KIND(self);
11834 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011835
Guido van Rossumd57fd912000-03-10 22:53:23 +000011836 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011837 if (length == 1)
11838 return PyBool_FromLong(
11839 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011840
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011841 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011842 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011843 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011844
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011845 for (i = 0; i < length; i++) {
11846 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011847 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011848 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011849 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011850}
11851
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011852PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011853 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011854\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011855Return True if all characters in S are digits\n\
11856and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011857
11858static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011859unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011860{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011861 Py_ssize_t i, length;
11862 int kind;
11863 void *data;
11864
11865 if (PyUnicode_READY(self) == -1)
11866 return NULL;
11867 length = PyUnicode_GET_LENGTH(self);
11868 kind = PyUnicode_KIND(self);
11869 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011870
Guido van Rossumd57fd912000-03-10 22:53:23 +000011871 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011872 if (length == 1) {
11873 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11874 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11875 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011876
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011877 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011878 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011879 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011880
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011881 for (i = 0; i < length; i++) {
11882 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011883 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011884 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011885 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011886}
11887
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011888PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011889 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011890\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011891Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011892False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011893
11894static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011895unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011896{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011897 Py_ssize_t i, length;
11898 int kind;
11899 void *data;
11900
11901 if (PyUnicode_READY(self) == -1)
11902 return NULL;
11903 length = PyUnicode_GET_LENGTH(self);
11904 kind = PyUnicode_KIND(self);
11905 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011906
Guido van Rossumd57fd912000-03-10 22:53:23 +000011907 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011908 if (length == 1)
11909 return PyBool_FromLong(
11910 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011911
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011912 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011913 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011914 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011915
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011916 for (i = 0; i < length; i++) {
11917 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011918 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011919 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011920 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011921}
11922
Martin v. Löwis47383402007-08-15 07:32:56 +000011923int
11924PyUnicode_IsIdentifier(PyObject *self)
11925{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011926 int kind;
11927 void *data;
11928 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011929 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011930
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011931 if (PyUnicode_READY(self) == -1) {
11932 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011933 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011934 }
11935
11936 /* Special case for empty strings */
11937 if (PyUnicode_GET_LENGTH(self) == 0)
11938 return 0;
11939 kind = PyUnicode_KIND(self);
11940 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011941
11942 /* PEP 3131 says that the first character must be in
11943 XID_Start and subsequent characters in XID_Continue,
11944 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011945 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011946 letters, digits, underscore). However, given the current
11947 definition of XID_Start and XID_Continue, it is sufficient
11948 to check just for these, except that _ must be allowed
11949 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011950 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011951 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011952 return 0;
11953
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011954 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011955 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011956 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011957 return 1;
11958}
11959
11960PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011961 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011962\n\
11963Return True if S is a valid identifier according\n\
Raymond Hettinger378170d2013-03-23 08:21:12 -070011964to the language definition.\n\
11965\n\
11966Use keyword.iskeyword() to test for reserved identifiers\n\
11967such as \"def\" and \"class\".\n");
Martin v. Löwis47383402007-08-15 07:32:56 +000011968
11969static PyObject*
11970unicode_isidentifier(PyObject *self)
11971{
11972 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11973}
11974
Georg Brandl559e5d72008-06-11 18:37:52 +000011975PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011976 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011977\n\
11978Return True if all characters in S are considered\n\
11979printable in repr() or S is empty, False otherwise.");
11980
11981static PyObject*
11982unicode_isprintable(PyObject *self)
11983{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011984 Py_ssize_t i, length;
11985 int kind;
11986 void *data;
11987
11988 if (PyUnicode_READY(self) == -1)
11989 return NULL;
11990 length = PyUnicode_GET_LENGTH(self);
11991 kind = PyUnicode_KIND(self);
11992 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011993
11994 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011995 if (length == 1)
11996 return PyBool_FromLong(
11997 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011998
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011999 for (i = 0; i < length; i++) {
12000 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012001 Py_RETURN_FALSE;
12002 }
12003 }
12004 Py_RETURN_TRUE;
12005}
12006
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012007PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000012008 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012009\n\
12010Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000012011iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012012
12013static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012014unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012015{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012016 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012017}
12018
Martin v. Löwis18e16552006-02-15 17:27:45 +000012019static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012020unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012021{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012022 if (PyUnicode_READY(self) == -1)
12023 return -1;
12024 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012025}
12026
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012027PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012028 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012029\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012030Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012031done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012032
12033static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012034unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012035{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012036 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012037 Py_UCS4 fillchar = ' ';
12038
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012039 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012040 return NULL;
12041
Benjamin Petersonbac79492012-01-14 13:34:47 -050012042 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012043 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012044
Victor Stinnerc4b49542011-12-11 22:44:26 +010012045 if (PyUnicode_GET_LENGTH(self) >= width)
12046 return unicode_result_unchanged(self);
12047
12048 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012049}
12050
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012051PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012052 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012053\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012054Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012055
12056static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012057unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012058{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012059 if (PyUnicode_READY(self) == -1)
12060 return NULL;
12061 if (PyUnicode_IS_ASCII(self))
12062 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012063 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012064}
12065
/* Strip modes.  The values double as indices into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument format strings, one per strip mode (indexed by the above). */
static const char * const stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Bare method name for mode i: the format string without its
   three-character "|O:" prefix. */
#define STRIPNAME(i) (stripformat[i] + 3)
12074
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012075/* externally visible for str.strip(unicode) */
12076PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012077_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012078{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012079 void *data;
12080 int kind;
12081 Py_ssize_t i, j, len;
12082 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012083 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012084
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012085 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12086 return NULL;
12087
12088 kind = PyUnicode_KIND(self);
12089 data = PyUnicode_DATA(self);
12090 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020012091 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012092 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12093 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020012094 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012095
Benjamin Peterson14339b62009-01-31 16:36:08 +000012096 i = 0;
12097 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012098 while (i < len) {
12099 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12100 if (!BLOOM(sepmask, ch))
12101 break;
12102 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12103 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012104 i++;
12105 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012106 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012107
Benjamin Peterson14339b62009-01-31 16:36:08 +000012108 j = len;
12109 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012110 j--;
12111 while (j >= i) {
12112 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12113 if (!BLOOM(sepmask, ch))
12114 break;
12115 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12116 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012117 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012118 }
12119
Benjamin Peterson29060642009-01-31 22:14:21 +000012120 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012121 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012122
Victor Stinner7931d9a2011-11-04 00:22:48 +010012123 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012124}
12125
12126PyObject*
12127PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12128{
12129 unsigned char *data;
12130 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012131 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012132
Victor Stinnerde636f32011-10-01 03:55:54 +020012133 if (PyUnicode_READY(self) == -1)
12134 return NULL;
12135
Victor Stinner684d5fd2012-05-03 02:32:34 +020012136 length = PyUnicode_GET_LENGTH(self);
12137 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012138
Victor Stinner684d5fd2012-05-03 02:32:34 +020012139 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012140 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012141
Victor Stinnerde636f32011-10-01 03:55:54 +020012142 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012143 PyErr_SetString(PyExc_IndexError, "string index out of range");
12144 return NULL;
12145 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012146 if (start >= length || end < start)
12147 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012148
Victor Stinner684d5fd2012-05-03 02:32:34 +020012149 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012150 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012151 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012152 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012153 }
12154 else {
12155 kind = PyUnicode_KIND(self);
12156 data = PyUnicode_1BYTE_DATA(self);
12157 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012158 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012159 length);
12160 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012161}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012162
12163static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012164do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012165{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012166 Py_ssize_t len, i, j;
12167
12168 if (PyUnicode_READY(self) == -1)
12169 return NULL;
12170
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012171 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012172
Victor Stinnercc7af722013-04-09 22:39:24 +020012173 if (PyUnicode_IS_ASCII(self)) {
12174 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12175
12176 i = 0;
12177 if (striptype != RIGHTSTRIP) {
12178 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012179 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012180 if (!_Py_ascii_whitespace[ch])
12181 break;
12182 i++;
12183 }
12184 }
12185
12186 j = len;
12187 if (striptype != LEFTSTRIP) {
12188 j--;
12189 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012190 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012191 if (!_Py_ascii_whitespace[ch])
12192 break;
12193 j--;
12194 }
12195 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012196 }
12197 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012198 else {
12199 int kind = PyUnicode_KIND(self);
12200 void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012201
Victor Stinnercc7af722013-04-09 22:39:24 +020012202 i = 0;
12203 if (striptype != RIGHTSTRIP) {
12204 while (i < len) {
12205 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12206 if (!Py_UNICODE_ISSPACE(ch))
12207 break;
12208 i++;
12209 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012210 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012211
12212 j = len;
12213 if (striptype != LEFTSTRIP) {
12214 j--;
12215 while (j >= i) {
12216 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12217 if (!Py_UNICODE_ISSPACE(ch))
12218 break;
12219 j--;
12220 }
12221 j++;
12222 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012223 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012224
Victor Stinner7931d9a2011-11-04 00:22:48 +010012225 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012226}
12227
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012228
12229static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012230do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012231{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012232 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012233
Serhiy Storchakac6792272013-10-19 21:03:34 +030012234 if (!PyArg_ParseTuple(args, stripformat[striptype], &sep))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012235 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012236
Benjamin Peterson14339b62009-01-31 16:36:08 +000012237 if (sep != NULL && sep != Py_None) {
12238 if (PyUnicode_Check(sep))
12239 return _PyUnicode_XStrip(self, striptype, sep);
12240 else {
12241 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012242 "%s arg must be None or str",
12243 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012244 return NULL;
12245 }
12246 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012247
Benjamin Peterson14339b62009-01-31 16:36:08 +000012248 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012249}
12250
12251
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012252PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012253 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012254\n\
12255Return a copy of the string S with leading and trailing\n\
12256whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012257If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012258
12259static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012260unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012261{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012262 if (PyTuple_GET_SIZE(args) == 0)
12263 return do_strip(self, BOTHSTRIP); /* Common case */
12264 else
12265 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012266}
12267
12268
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012269PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012270 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012271\n\
12272Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012273If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012274
12275static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012276unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012277{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012278 if (PyTuple_GET_SIZE(args) == 0)
12279 return do_strip(self, LEFTSTRIP); /* Common case */
12280 else
12281 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012282}
12283
12284
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012285PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012286 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012287\n\
12288Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012289If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012290
12291static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012292unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012293{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012294 if (PyTuple_GET_SIZE(args) == 0)
12295 return do_strip(self, RIGHTSTRIP); /* Common case */
12296 else
12297 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012298}
12299
12300
Guido van Rossumd57fd912000-03-10 22:53:23 +000012301static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012302unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012303{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012304 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012305 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012306
Serhiy Storchaka05997252013-01-26 12:14:02 +020012307 if (len < 1)
12308 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012309
Victor Stinnerc4b49542011-12-11 22:44:26 +010012310 /* no repeat, return original string */
12311 if (len == 1)
12312 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012313
Benjamin Petersonbac79492012-01-14 13:34:47 -050012314 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012315 return NULL;
12316
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012317 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012318 PyErr_SetString(PyExc_OverflowError,
12319 "repeated string is too long");
12320 return NULL;
12321 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012322 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012323
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012324 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012325 if (!u)
12326 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012327 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012328
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012329 if (PyUnicode_GET_LENGTH(str) == 1) {
12330 const int kind = PyUnicode_KIND(str);
12331 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012332 if (kind == PyUnicode_1BYTE_KIND) {
12333 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012334 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012335 }
12336 else if (kind == PyUnicode_2BYTE_KIND) {
12337 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012338 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012339 ucs2[n] = fill_char;
12340 } else {
12341 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12342 assert(kind == PyUnicode_4BYTE_KIND);
12343 for (n = 0; n < len; ++n)
12344 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012345 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012346 }
12347 else {
12348 /* number of characters copied this far */
12349 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012350 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012351 char *to = (char *) PyUnicode_DATA(u);
12352 Py_MEMCPY(to, PyUnicode_DATA(str),
12353 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012354 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012355 n = (done <= nchars-done) ? done : nchars-done;
12356 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012357 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012358 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012359 }
12360
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012361 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012362 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012363}
12364
Alexander Belopolsky40018472011-02-26 01:02:56 +000012365PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012366PyUnicode_Replace(PyObject *str,
12367 PyObject *substr,
12368 PyObject *replstr,
Alexander Belopolsky40018472011-02-26 01:02:56 +000012369 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012370{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012371 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12372 ensure_unicode(replstr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012373 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012374 return replace(str, substr, replstr, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012375}
12376
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012377PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000012378 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012379\n\
12380Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000012381old replaced by new. If the optional argument count is\n\
12382given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012383
12384static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012385unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012386{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012387 PyObject *str1;
12388 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012389 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012390
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012391 if (!PyArg_ParseTuple(args, "UU|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012392 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060012393 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012394 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012395 return replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012396}
12397
Alexander Belopolsky40018472011-02-26 01:02:56 +000012398static PyObject *
12399unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012400{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012401 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012402 Py_ssize_t isize;
12403 Py_ssize_t osize, squote, dquote, i, o;
12404 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012405 int ikind, okind, unchanged;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012406 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012407
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012408 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012409 return NULL;
12410
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012411 isize = PyUnicode_GET_LENGTH(unicode);
12412 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012413
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012414 /* Compute length of output, quote characters, and
12415 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012416 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012417 max = 127;
12418 squote = dquote = 0;
12419 ikind = PyUnicode_KIND(unicode);
12420 for (i = 0; i < isize; i++) {
12421 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040012422 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012423 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040012424 case '\'': squote++; break;
12425 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012426 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040012427 incr = 2;
12428 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012429 default:
12430 /* Fast-path ASCII */
12431 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012432 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012433 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012434 ;
12435 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012436 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012437 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012438 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012439 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012440 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012441 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040012442 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012443 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040012444 if (osize > PY_SSIZE_T_MAX - incr) {
12445 PyErr_SetString(PyExc_OverflowError,
12446 "string is too long to generate repr");
12447 return NULL;
12448 }
12449 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012450 }
12451
12452 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012453 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012454 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012455 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012456 if (dquote)
12457 /* Both squote and dquote present. Use squote,
12458 and escape them */
12459 osize += squote;
12460 else
12461 quote = '"';
12462 }
Victor Stinner55c08782013-04-14 18:45:39 +020012463 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012464
12465 repr = PyUnicode_New(osize, max);
12466 if (repr == NULL)
12467 return NULL;
12468 okind = PyUnicode_KIND(repr);
12469 odata = PyUnicode_DATA(repr);
12470
12471 PyUnicode_WRITE(okind, odata, 0, quote);
12472 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012473 if (unchanged) {
12474 _PyUnicode_FastCopyCharacters(repr, 1,
12475 unicode, 0,
12476 isize);
12477 }
12478 else {
12479 for (i = 0, o = 1; i < isize; i++) {
12480 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012481
Victor Stinner55c08782013-04-14 18:45:39 +020012482 /* Escape quotes and backslashes */
12483 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012484 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012485 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012486 continue;
12487 }
12488
12489 /* Map special whitespace to '\t', \n', '\r' */
12490 if (ch == '\t') {
12491 PyUnicode_WRITE(okind, odata, o++, '\\');
12492 PyUnicode_WRITE(okind, odata, o++, 't');
12493 }
12494 else if (ch == '\n') {
12495 PyUnicode_WRITE(okind, odata, o++, '\\');
12496 PyUnicode_WRITE(okind, odata, o++, 'n');
12497 }
12498 else if (ch == '\r') {
12499 PyUnicode_WRITE(okind, odata, o++, '\\');
12500 PyUnicode_WRITE(okind, odata, o++, 'r');
12501 }
12502
12503 /* Map non-printable US ASCII to '\xhh' */
12504 else if (ch < ' ' || ch == 0x7F) {
12505 PyUnicode_WRITE(okind, odata, o++, '\\');
12506 PyUnicode_WRITE(okind, odata, o++, 'x');
12507 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12508 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12509 }
12510
12511 /* Copy ASCII characters as-is */
12512 else if (ch < 0x7F) {
12513 PyUnicode_WRITE(okind, odata, o++, ch);
12514 }
12515
12516 /* Non-ASCII characters */
12517 else {
12518 /* Map Unicode whitespace and control characters
12519 (categories Z* and C* except ASCII space)
12520 */
12521 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12522 PyUnicode_WRITE(okind, odata, o++, '\\');
12523 /* Map 8-bit characters to '\xhh' */
12524 if (ch <= 0xff) {
12525 PyUnicode_WRITE(okind, odata, o++, 'x');
12526 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12527 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12528 }
12529 /* Map 16-bit characters to '\uxxxx' */
12530 else if (ch <= 0xffff) {
12531 PyUnicode_WRITE(okind, odata, o++, 'u');
12532 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12533 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12534 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12535 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12536 }
12537 /* Map 21-bit characters to '\U00xxxxxx' */
12538 else {
12539 PyUnicode_WRITE(okind, odata, o++, 'U');
12540 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12541 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12542 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12543 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12544 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12545 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12546 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12547 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12548 }
12549 }
12550 /* Copy characters as-is */
12551 else {
12552 PyUnicode_WRITE(okind, odata, o++, ch);
12553 }
Georg Brandl559e5d72008-06-11 18:37:52 +000012554 }
12555 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012556 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012557 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012558 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012559 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012560}
12561
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012562PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012563 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012564\n\
12565Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012566such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012567arguments start and end are interpreted as in slice notation.\n\
12568\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012569Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012570
12571static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012572unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012573{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012574 /* initialize variables to prevent gcc warning */
12575 PyObject *substring = NULL;
12576 Py_ssize_t start = 0;
12577 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012578 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012579
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012580 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012581 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012582
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012583 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012584 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012585
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012586 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012587
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012588 if (result == -2)
12589 return NULL;
12590
Christian Heimes217cfd12007-12-02 14:31:20 +000012591 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012592}
12593
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012594PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012595 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012596\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012597Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012598
12599static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012600unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012601{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012602 /* initialize variables to prevent gcc warning */
12603 PyObject *substring = NULL;
12604 Py_ssize_t start = 0;
12605 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012606 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012607
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012608 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012609 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012610
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012611 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012612 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012613
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012614 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012615
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012616 if (result == -2)
12617 return NULL;
12618
Guido van Rossumd57fd912000-03-10 22:53:23 +000012619 if (result < 0) {
12620 PyErr_SetString(PyExc_ValueError, "substring not found");
12621 return NULL;
12622 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012623
Christian Heimes217cfd12007-12-02 14:31:20 +000012624 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012625}
12626
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012627PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012628 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012629\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012630Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012631done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012632
12633static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012634unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012635{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012636 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012637 Py_UCS4 fillchar = ' ';
12638
Victor Stinnere9a29352011-10-01 02:14:59 +020012639 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012640 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012641
Benjamin Petersonbac79492012-01-14 13:34:47 -050012642 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012643 return NULL;
12644
Victor Stinnerc4b49542011-12-11 22:44:26 +010012645 if (PyUnicode_GET_LENGTH(self) >= width)
12646 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012647
Victor Stinnerc4b49542011-12-11 22:44:26 +010012648 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012649}
12650
Alexander Belopolsky40018472011-02-26 01:02:56 +000012651PyObject *
12652PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012653{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012654 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012655 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012656
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012657 return split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012658}
12659
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012660PyDoc_STRVAR(split__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012661 "S.split(sep=None, maxsplit=-1) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012662\n\
12663Return a list of the words in S, using sep as the\n\
12664delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012665splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012666whitespace string is a separator and empty strings are\n\
12667removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012668
12669static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012670unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012671{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012672 static char *kwlist[] = {"sep", "maxsplit", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000012673 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012674 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012675
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012676 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12677 kwlist, &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012678 return NULL;
12679
12680 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012681 return split(self, NULL, maxcount);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012682
12683 if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012684 return split(self, substring, maxcount);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012685
12686 PyErr_Format(PyExc_TypeError,
12687 "must be str or None, not %.100s",
12688 Py_TYPE(substring)->tp_name);
12689 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012690}
12691
Thomas Wouters477c8d52006-05-27 19:21:47 +000012692PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012693PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012694{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012695 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012696 int kind1, kind2;
12697 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012698 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012699
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012700 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012701 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012702
Victor Stinner14f8f022011-10-05 20:58:25 +020012703 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012704 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012705 len1 = PyUnicode_GET_LENGTH(str_obj);
12706 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012707 if (kind1 < kind2 || len1 < len2) {
12708 _Py_INCREF_UNICODE_EMPTY();
12709 if (!unicode_empty)
12710 out = NULL;
12711 else {
12712 out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
12713 Py_DECREF(unicode_empty);
12714 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012715 return out;
12716 }
12717 buf1 = PyUnicode_DATA(str_obj);
12718 buf2 = PyUnicode_DATA(sep_obj);
12719 if (kind2 != kind1) {
12720 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12721 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012722 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012723 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012724
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012725 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012726 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012727 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12728 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12729 else
12730 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012731 break;
12732 case PyUnicode_2BYTE_KIND:
12733 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12734 break;
12735 case PyUnicode_4BYTE_KIND:
12736 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12737 break;
12738 default:
12739 assert(0);
12740 out = 0;
12741 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012742
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012743 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012744 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012745
12746 return out;
12747}
12748
12749
12750PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012751PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012752{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012753 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012754 int kind1, kind2;
12755 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012756 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012757
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012758 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012759 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012760
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012761 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012762 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012763 len1 = PyUnicode_GET_LENGTH(str_obj);
12764 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012765 if (kind1 < kind2 || len1 < len2) {
12766 _Py_INCREF_UNICODE_EMPTY();
12767 if (!unicode_empty)
12768 out = NULL;
12769 else {
12770 out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
12771 Py_DECREF(unicode_empty);
12772 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012773 return out;
12774 }
12775 buf1 = PyUnicode_DATA(str_obj);
12776 buf2 = PyUnicode_DATA(sep_obj);
12777 if (kind2 != kind1) {
12778 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12779 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012780 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012781 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012782
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012783 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012784 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012785 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12786 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12787 else
12788 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012789 break;
12790 case PyUnicode_2BYTE_KIND:
12791 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12792 break;
12793 case PyUnicode_4BYTE_KIND:
12794 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12795 break;
12796 default:
12797 assert(0);
12798 out = 0;
12799 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012800
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012801 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012802 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012803
12804 return out;
12805}
12806
12807PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012808 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012809\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012810Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012811the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012812found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012813
12814static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012815unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012816{
Victor Stinner9310abb2011-10-05 00:59:23 +020012817 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012818}
12819
12820PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012821 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012822\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012823Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012824the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012825separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012826
12827static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012828unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012829{
Victor Stinner9310abb2011-10-05 00:59:23 +020012830 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012831}
12832
Alexander Belopolsky40018472011-02-26 01:02:56 +000012833PyObject *
12834PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012835{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012836 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012837 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012838
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012839 return rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012840}
12841
12842PyDoc_STRVAR(rsplit__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012843 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012844\n\
12845Return a list of the words in S, using sep as the\n\
12846delimiter string, starting at the end of the string and\n\
12847working to the front. If maxsplit is given, at most maxsplit\n\
12848splits are done. If sep is not specified, any whitespace string\n\
12849is a separator.");
12850
12851static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012852unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012853{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012854 static char *kwlist[] = {"sep", "maxsplit", 0};
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012855 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012856 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012857
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012858 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
12859 kwlist, &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012860 return NULL;
12861
12862 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012863 return rsplit(self, NULL, maxcount);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012864
12865 if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012866 return rsplit(self, substring, maxcount);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012867
12868 PyErr_Format(PyExc_TypeError,
12869 "must be str or None, not %.100s",
12870 Py_TYPE(substring)->tp_name);
12871 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012872}
12873
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012874PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012875 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012876\n\
12877Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012878Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012879is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012880
12881static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012882unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012883{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012884 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012885 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012886
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012887 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12888 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012889 return NULL;
12890
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012891 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012892}
12893
12894static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012895PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012896{
Victor Stinnerc4b49542011-12-11 22:44:26 +010012897 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012898}
12899
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012900PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012901 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012902\n\
12903Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012904and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012905
12906static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012907unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012908{
Benjamin Petersoneea48462012-01-16 14:28:50 -050012909 if (PyUnicode_READY(self) == -1)
12910 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012911 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012912}
12913
Larry Hastings61272b72014-01-07 12:41:53 -080012914/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000012915
Larry Hastings31826802013-10-19 00:09:25 -070012916@staticmethod
12917str.maketrans as unicode_maketrans
12918
12919 x: object
12920
12921 y: unicode=NULL
12922
12923 z: unicode=NULL
12924
12925 /
12926
12927Return a translation table usable for str.translate().
12928
12929If there is only one argument, it must be a dictionary mapping Unicode
12930ordinals (integers) or characters to Unicode ordinals, strings or None.
12931Character keys will be then converted to ordinals.
12932If there are two arguments, they must be strings of equal length, and
12933in the resulting dictionary, each character in x will be mapped to the
12934character at the same position in y. If there is a third argument, it
12935must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080012936[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070012937
Larry Hastings31826802013-10-19 00:09:25 -070012938static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080012939unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030012940/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070012941{
Georg Brandlceee0772007-11-27 23:48:05 +000012942 PyObject *new = NULL, *key, *value;
12943 Py_ssize_t i = 0;
12944 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012945
Georg Brandlceee0772007-11-27 23:48:05 +000012946 new = PyDict_New();
12947 if (!new)
12948 return NULL;
12949 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012950 int x_kind, y_kind, z_kind;
12951 void *x_data, *y_data, *z_data;
12952
Georg Brandlceee0772007-11-27 23:48:05 +000012953 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012954 if (!PyUnicode_Check(x)) {
12955 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12956 "be a string if there is a second argument");
12957 goto err;
12958 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012959 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012960 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12961 "arguments must have equal length");
12962 goto err;
12963 }
12964 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012965 x_kind = PyUnicode_KIND(x);
12966 y_kind = PyUnicode_KIND(y);
12967 x_data = PyUnicode_DATA(x);
12968 y_data = PyUnicode_DATA(y);
12969 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12970 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012971 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000012972 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060012973 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012974 if (!value) {
12975 Py_DECREF(key);
12976 goto err;
12977 }
Georg Brandlceee0772007-11-27 23:48:05 +000012978 res = PyDict_SetItem(new, key, value);
12979 Py_DECREF(key);
12980 Py_DECREF(value);
12981 if (res < 0)
12982 goto err;
12983 }
12984 /* create entries for deleting chars in z */
12985 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012986 z_kind = PyUnicode_KIND(z);
12987 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012988 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012989 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012990 if (!key)
12991 goto err;
12992 res = PyDict_SetItem(new, key, Py_None);
12993 Py_DECREF(key);
12994 if (res < 0)
12995 goto err;
12996 }
12997 }
12998 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012999 int kind;
13000 void *data;
13001
Georg Brandlceee0772007-11-27 23:48:05 +000013002 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013003 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013004 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13005 "to maketrans it must be a dict");
13006 goto err;
13007 }
13008 /* copy entries into the new dict, converting string keys to int keys */
13009 while (PyDict_Next(x, &i, &key, &value)) {
13010 if (PyUnicode_Check(key)) {
13011 /* convert string keys to integer keys */
13012 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013013 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013014 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13015 "table must be of length 1");
13016 goto err;
13017 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013018 kind = PyUnicode_KIND(key);
13019 data = PyUnicode_DATA(key);
13020 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013021 if (!newkey)
13022 goto err;
13023 res = PyDict_SetItem(new, newkey, value);
13024 Py_DECREF(newkey);
13025 if (res < 0)
13026 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013027 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013028 /* just keep integer keys */
13029 if (PyDict_SetItem(new, key, value) < 0)
13030 goto err;
13031 } else {
13032 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13033 "be strings or integers");
13034 goto err;
13035 }
13036 }
13037 }
13038 return new;
13039 err:
13040 Py_DECREF(new);
13041 return NULL;
13042}
13043
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013044PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013045 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013046\n\
Zachary Ware79b98df2015-08-05 23:54:15 -050013047Return a copy of the string S in which each character has been mapped\n\
13048through the given translation table. The table must implement\n\
13049lookup/indexing via __getitem__, for instance a dictionary or list,\n\
13050mapping Unicode ordinals to Unicode ordinals, strings, or None. If\n\
13051this operation raises LookupError, the character is left untouched.\n\
13052Characters mapped to None are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013053
13054static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013055unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013056{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013057 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013058}
13059
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013060PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013061 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013062\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013063Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013064
13065static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020013066unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013067{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013068 if (PyUnicode_READY(self) == -1)
13069 return NULL;
13070 if (PyUnicode_IS_ASCII(self))
13071 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013072 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013073}
13074
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013075PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013076 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013077\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000013078Pad a numeric string S with zeros on the left, to fill a field\n\
13079of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013080
13081static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020013082unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013083{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013084 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013085 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013086 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013087 int kind;
13088 void *data;
13089 Py_UCS4 chr;
13090
Martin v. Löwis18e16552006-02-15 17:27:45 +000013091 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000013092 return NULL;
13093
Benjamin Petersonbac79492012-01-14 13:34:47 -050013094 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013095 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013096
Victor Stinnerc4b49542011-12-11 22:44:26 +010013097 if (PyUnicode_GET_LENGTH(self) >= width)
13098 return unicode_result_unchanged(self);
13099
13100 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013101
13102 u = pad(self, fill, 0, '0');
13103
Walter Dörwald068325e2002-04-15 13:36:47 +000013104 if (u == NULL)
13105 return NULL;
13106
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013107 kind = PyUnicode_KIND(u);
13108 data = PyUnicode_DATA(u);
13109 chr = PyUnicode_READ(kind, data, fill);
13110
13111 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013112 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013113 PyUnicode_WRITE(kind, data, 0, chr);
13114 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013115 }
13116
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013117 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013118 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013119}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013120
#if 0
/* Disabled by the preprocessor: thin wrapper returning the result of
   PyUnicode_TransformDecimalAndSpaceToASCII(self). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
13128
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013129PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013130 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013131\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013132Return True if S starts with the specified prefix, False otherwise.\n\
13133With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013134With optional end, stop comparing S at that position.\n\
13135prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013136
13137static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013138unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013139 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013140{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013141 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013142 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013143 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013144 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013145 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013146
Jesus Ceaac451502011-04-20 17:09:23 +020013147 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013148 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013149 if (PyTuple_Check(subobj)) {
13150 Py_ssize_t i;
13151 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013152 substring = PyTuple_GET_ITEM(subobj, i);
13153 if (!PyUnicode_Check(substring)) {
13154 PyErr_Format(PyExc_TypeError,
13155 "tuple for startswith must only contain str, "
13156 "not %.100s",
13157 Py_TYPE(substring)->tp_name);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013158 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013159 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013160 result = tailmatch(self, substring, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013161 if (result == -1)
13162 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013163 if (result) {
13164 Py_RETURN_TRUE;
13165 }
13166 }
13167 /* nothing matched */
13168 Py_RETURN_FALSE;
13169 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013170 if (!PyUnicode_Check(subobj)) {
13171 PyErr_Format(PyExc_TypeError,
13172 "startswith first arg must be str or "
13173 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013174 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013175 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013176 result = tailmatch(self, subobj, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013177 if (result == -1)
13178 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013179 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013180}
13181
13182
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013183PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013184 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013185\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013186Return True if S ends with the specified suffix, False otherwise.\n\
13187With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013188With optional end, stop comparing S at that position.\n\
13189suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013190
13191static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013192unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013193 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013194{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013195 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013196 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013197 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013198 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013199 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013200
Jesus Ceaac451502011-04-20 17:09:23 +020013201 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013202 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013203 if (PyTuple_Check(subobj)) {
13204 Py_ssize_t i;
13205 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013206 substring = PyTuple_GET_ITEM(subobj, i);
13207 if (!PyUnicode_Check(substring)) {
13208 PyErr_Format(PyExc_TypeError,
13209 "tuple for endswith must only contain str, "
13210 "not %.100s",
13211 Py_TYPE(substring)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013212 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013213 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013214 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013215 if (result == -1)
13216 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013217 if (result) {
13218 Py_RETURN_TRUE;
13219 }
13220 }
13221 Py_RETURN_FALSE;
13222 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013223 if (!PyUnicode_Check(subobj)) {
13224 PyErr_Format(PyExc_TypeError,
13225 "endswith first arg must be str or "
13226 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013227 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013228 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013229 result = tailmatch(self, subobj, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013230 if (result == -1)
13231 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013232 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013233}
13234
Victor Stinner202fdca2012-05-07 12:47:02 +020013235Py_LOCAL_INLINE(void)
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013236_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013237{
Victor Stinnereb36fda2015-10-03 01:55:51 +020013238 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13239 writer->data = PyUnicode_DATA(writer->buffer);
13240
13241 if (!writer->readonly) {
13242 writer->kind = PyUnicode_KIND(writer->buffer);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013243 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinnereb36fda2015-10-03 01:55:51 +020013244 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013245 else {
Victor Stinnereb36fda2015-10-03 01:55:51 +020013246 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13247 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13248 writer->kind = PyUnicode_WCHAR_KIND;
13249 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13250
Victor Stinner8f674cc2013-04-17 23:02:17 +020013251 /* Copy-on-write mode: set buffer size to 0 so
13252 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13253 * next write. */
13254 writer->size = 0;
13255 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013256}
13257
Victor Stinnerd3f08822012-05-29 12:57:52 +020013258void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013259_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013260{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013261 memset(writer, 0, sizeof(*writer));
Victor Stinnereb36fda2015-10-03 01:55:51 +020013262
13263 /* ASCII is the bare minimum */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013264 writer->min_char = 127;
Victor Stinnereb36fda2015-10-03 01:55:51 +020013265
13266 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13267 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13268 writer->kind = PyUnicode_WCHAR_KIND;
13269 assert(writer->kind <= PyUnicode_1BYTE_KIND);
Victor Stinner202fdca2012-05-07 12:47:02 +020013270}
13271
Victor Stinnerd3f08822012-05-29 12:57:52 +020013272int
13273_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13274 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013275{
13276 Py_ssize_t newlen;
13277 PyObject *newbuffer;
13278
Victor Stinnerca9381e2015-09-22 00:58:32 +020013279 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
Victor Stinner61744742015-09-22 01:01:17 +020013280 assert((maxchar > writer->maxchar && length >= 0)
13281 || length > 0);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013282
Victor Stinner202fdca2012-05-07 12:47:02 +020013283 if (length > PY_SSIZE_T_MAX - writer->pos) {
13284 PyErr_NoMemory();
13285 return -1;
13286 }
13287 newlen = writer->pos + length;
13288
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013289 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013290
Victor Stinnerd3f08822012-05-29 12:57:52 +020013291 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013292 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013293 if (writer->overallocate
13294 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13295 /* overallocate to limit the number of realloc() */
13296 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013297 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013298 if (newlen < writer->min_length)
13299 newlen = writer->min_length;
13300
Victor Stinnerd3f08822012-05-29 12:57:52 +020013301 writer->buffer = PyUnicode_New(newlen, maxchar);
13302 if (writer->buffer == NULL)
13303 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013304 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013305 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013306 if (writer->overallocate
13307 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13308 /* overallocate to limit the number of realloc() */
13309 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013310 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013311 if (newlen < writer->min_length)
13312 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013313
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013314 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013315 /* resize + widen */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +030013316 maxchar = Py_MAX(maxchar, writer->maxchar);
Victor Stinner202fdca2012-05-07 12:47:02 +020013317 newbuffer = PyUnicode_New(newlen, maxchar);
13318 if (newbuffer == NULL)
13319 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013320 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13321 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013322 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013323 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013324 }
13325 else {
13326 newbuffer = resize_compact(writer->buffer, newlen);
13327 if (newbuffer == NULL)
13328 return -1;
13329 }
13330 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013331 }
13332 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013333 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013334 newbuffer = PyUnicode_New(writer->size, maxchar);
13335 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013336 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013337 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13338 writer->buffer, 0, writer->pos);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030013339 Py_SETREF(writer->buffer, newbuffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013340 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013341 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013342 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010013343
13344#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020013345}
13346
Victor Stinnerca9381e2015-09-22 00:58:32 +020013347int
13348_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13349 enum PyUnicode_Kind kind)
13350{
13351 Py_UCS4 maxchar;
13352
13353 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13354 assert(writer->kind < kind);
13355
13356 switch (kind)
13357 {
13358 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13359 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13360 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
13361 default:
13362 assert(0 && "invalid kind");
13363 return -1;
13364 }
13365
13366 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13367}
13368
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013369Py_LOCAL_INLINE(int)
13370_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013371{
13372 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13373 return -1;
13374 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13375 writer->pos++;
13376 return 0;
13377}
13378
13379int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013380_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13381{
13382 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13383}
13384
13385int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013386_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13387{
13388 Py_UCS4 maxchar;
13389 Py_ssize_t len;
13390
13391 if (PyUnicode_READY(str) == -1)
13392 return -1;
13393 len = PyUnicode_GET_LENGTH(str);
13394 if (len == 0)
13395 return 0;
13396 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13397 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013398 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010013399 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020013400 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013401 Py_INCREF(str);
13402 writer->buffer = str;
13403 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013404 writer->pos += len;
13405 return 0;
13406 }
13407 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13408 return -1;
13409 }
13410 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13411 str, 0, len);
13412 writer->pos += len;
13413 return 0;
13414}
13415
Victor Stinnere215d962012-10-06 23:03:36 +020013416int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013417_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13418 Py_ssize_t start, Py_ssize_t end)
13419{
13420 Py_UCS4 maxchar;
13421 Py_ssize_t len;
13422
13423 if (PyUnicode_READY(str) == -1)
13424 return -1;
13425
13426 assert(0 <= start);
13427 assert(end <= PyUnicode_GET_LENGTH(str));
13428 assert(start <= end);
13429
13430 if (end == 0)
13431 return 0;
13432
13433 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13434 return _PyUnicodeWriter_WriteStr(writer, str);
13435
13436 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13437 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13438 else
13439 maxchar = writer->maxchar;
13440 len = end - start;
13441
13442 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13443 return -1;
13444
13445 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13446 str, start, len);
13447 writer->pos += len;
13448 return 0;
13449}
13450
13451int
Victor Stinner4a587072013-11-19 12:54:53 +010013452_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13453 const char *ascii, Py_ssize_t len)
13454{
13455 if (len == -1)
13456 len = strlen(ascii);
13457
13458 assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13459
13460 if (writer->buffer == NULL && !writer->overallocate) {
13461 PyObject *str;
13462
13463 str = _PyUnicode_FromASCII(ascii, len);
13464 if (str == NULL)
13465 return -1;
13466
13467 writer->readonly = 1;
13468 writer->buffer = str;
13469 _PyUnicodeWriter_Update(writer);
13470 writer->pos += len;
13471 return 0;
13472 }
13473
13474 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13475 return -1;
13476
13477 switch (writer->kind)
13478 {
13479 case PyUnicode_1BYTE_KIND:
13480 {
13481 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13482 Py_UCS1 *data = writer->data;
13483
13484 Py_MEMCPY(data + writer->pos, str, len);
13485 break;
13486 }
13487 case PyUnicode_2BYTE_KIND:
13488 {
13489 _PyUnicode_CONVERT_BYTES(
13490 Py_UCS1, Py_UCS2,
13491 ascii, ascii + len,
13492 (Py_UCS2 *)writer->data + writer->pos);
13493 break;
13494 }
13495 case PyUnicode_4BYTE_KIND:
13496 {
13497 _PyUnicode_CONVERT_BYTES(
13498 Py_UCS1, Py_UCS4,
13499 ascii, ascii + len,
13500 (Py_UCS4 *)writer->data + writer->pos);
13501 break;
13502 }
13503 default:
13504 assert(0);
13505 }
13506
13507 writer->pos += len;
13508 return 0;
13509}
13510
13511int
13512_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13513 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020013514{
13515 Py_UCS4 maxchar;
13516
13517 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13518 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13519 return -1;
13520 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13521 writer->pos += len;
13522 return 0;
13523}
13524
Victor Stinnerd3f08822012-05-29 12:57:52 +020013525PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013526_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013527{
Victor Stinner15a0bd32013-07-08 22:29:55 +020013528 PyObject *str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013529 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013530 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013531 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013532 }
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013533 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013534 str = writer->buffer;
13535 writer->buffer = NULL;
13536 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13537 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013538 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013539 if (writer->pos == 0) {
13540 Py_CLEAR(writer->buffer);
13541
13542 /* Get the empty Unicode string singleton ('') */
13543 _Py_INCREF_UNICODE_EMPTY();
13544 str = unicode_empty;
Victor Stinner202fdca2012-05-07 12:47:02 +020013545 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013546 else {
13547 str = writer->buffer;
13548 writer->buffer = NULL;
13549
13550 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
13551 PyObject *str2;
13552 str2 = resize_compact(str, writer->pos);
13553 if (str2 == NULL)
13554 return NULL;
13555 str = str2;
13556 }
13557 }
13558
Victor Stinner15a0bd32013-07-08 22:29:55 +020013559 assert(_PyUnicode_CheckConsistency(str, 1));
13560 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020013561}
13562
Victor Stinnerd3f08822012-05-29 12:57:52 +020013563void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013564_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013565{
13566 Py_CLEAR(writer->buffer);
13567}
13568
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013569#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013570
13571PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013572 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013573\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013574Return a formatted version of S, using substitutions from args and kwargs.\n\
13575The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013576
Eric Smith27bbca62010-11-04 17:06:58 +000013577PyDoc_STRVAR(format_map__doc__,
13578 "S.format_map(mapping) -> str\n\
13579\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013580Return a formatted version of S, using substitutions from mapping.\n\
13581The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013582
Eric Smith4a7d76d2008-05-30 18:10:19 +000013583static PyObject *
13584unicode__format__(PyObject* self, PyObject* args)
13585{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013586 PyObject *format_spec;
13587 _PyUnicodeWriter writer;
13588 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013589
13590 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
13591 return NULL;
13592
Victor Stinnerd3f08822012-05-29 12:57:52 +020013593 if (PyUnicode_READY(self) == -1)
13594 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020013595 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013596 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13597 self, format_spec, 0,
13598 PyUnicode_GET_LENGTH(format_spec));
13599 if (ret == -1) {
13600 _PyUnicodeWriter_Dealloc(&writer);
13601 return NULL;
13602 }
13603 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013604}
13605
Eric Smith8c663262007-08-25 02:26:07 +000013606PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013607 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013608\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013609Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000013610
13611static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013612unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013613{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013614 Py_ssize_t size;
13615
13616 /* If it's a compact object, account for base structure +
13617 character data. */
13618 if (PyUnicode_IS_COMPACT_ASCII(v))
13619 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
13620 else if (PyUnicode_IS_COMPACT(v))
13621 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013622 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013623 else {
13624 /* If it is a two-block object, account for base object, and
13625 for character block if present. */
13626 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020013627 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013628 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013629 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013630 }
13631 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013632 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020013633 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013634 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020013635 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020013636 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013637
13638 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013639}
13640
13641PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013642 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013643
13644static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013645unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013646{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013647 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013648 if (!copy)
13649 return NULL;
13650 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013651}
13652
Guido van Rossumd57fd912000-03-10 22:53:23 +000013653static PyMethodDef unicode_methods[] = {
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000013654 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013655 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
Ezio Melotticda6b6d2012-02-26 09:39:55 +020013656 {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
13657 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013658 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
13659 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
Benjamin Petersond5890c82012-01-14 13:23:30 -050013660 {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013661 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
13662 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
13663 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
Ezio Melotti745d54d2013-11-16 19:10:57 +020013664 {"expandtabs", (PyCFunction) unicode_expandtabs,
13665 METH_VARARGS | METH_KEYWORDS, expandtabs__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013666 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013667 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013668 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
13669 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
13670 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013671 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013672 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13673 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
13674 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013675 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013676 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Ezio Melotti745d54d2013-11-16 19:10:57 +020013677 {"splitlines", (PyCFunction) unicode_splitlines,
13678 METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013679 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013680 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
13681 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
13682 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
13683 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13684 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
13685 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
13686 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
13687 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
13688 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
13689 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
13690 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
13691 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
13692 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
13693 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000013694 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000013695 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013696 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000013697 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013698 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000013699 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Larry Hastings31826802013-10-19 00:09:25 -070013700 UNICODE_MAKETRANS_METHODDEF
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013701 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000013702#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013703 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013704 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013705#endif
13706
Benjamin Peterson14339b62009-01-31 16:36:08 +000013707 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013708 {NULL, NULL}
13709};
13710
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013711static PyObject *
13712unicode_mod(PyObject *v, PyObject *w)
13713{
Brian Curtindfc80e32011-08-10 20:28:54 -050013714 if (!PyUnicode_Check(v))
13715 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013716 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013717}
13718
13719static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013720 0, /*nb_add*/
13721 0, /*nb_subtract*/
13722 0, /*nb_multiply*/
13723 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013724};
13725
Guido van Rossumd57fd912000-03-10 22:53:23 +000013726static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013727 (lenfunc) unicode_length, /* sq_length */
13728 PyUnicode_Concat, /* sq_concat */
13729 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13730 (ssizeargfunc) unicode_getitem, /* sq_item */
13731 0, /* sq_slice */
13732 0, /* sq_ass_item */
13733 0, /* sq_ass_slice */
13734 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013735};
13736
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013737static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013738unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013739{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013740 if (PyUnicode_READY(self) == -1)
13741 return NULL;
13742
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013743 if (PyIndex_Check(item)) {
13744 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013745 if (i == -1 && PyErr_Occurred())
13746 return NULL;
13747 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013748 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013749 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013750 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013751 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013752 PyObject *result;
13753 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013754 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013755 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013756
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013757 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013758 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013759 return NULL;
13760 }
13761
13762 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020013763 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013764 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013765 slicelength == PyUnicode_GET_LENGTH(self)) {
13766 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013767 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013768 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013769 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013770 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013771 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013772 src_kind = PyUnicode_KIND(self);
13773 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013774 if (!PyUnicode_IS_ASCII(self)) {
13775 kind_limit = kind_maxchar_limit(src_kind);
13776 max_char = 0;
13777 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13778 ch = PyUnicode_READ(src_kind, src_data, cur);
13779 if (ch > max_char) {
13780 max_char = ch;
13781 if (max_char >= kind_limit)
13782 break;
13783 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013784 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013785 }
Victor Stinner55c99112011-10-13 01:17:06 +020013786 else
13787 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013788 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013789 if (result == NULL)
13790 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013791 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013792 dest_data = PyUnicode_DATA(result);
13793
13794 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013795 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13796 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013797 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013798 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013799 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013800 } else {
13801 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13802 return NULL;
13803 }
13804}
13805
13806static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013807 (lenfunc)unicode_length, /* mp_length */
13808 (binaryfunc)unicode_subscript, /* mp_subscript */
13809 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013810};
13811
Guido van Rossumd57fd912000-03-10 22:53:23 +000013812
Guido van Rossumd57fd912000-03-10 22:53:23 +000013813/* Helpers for PyUnicode_Format() */
13814
Victor Stinnera47082312012-10-04 02:19:54 +020013815struct unicode_formatter_t {
13816 PyObject *args;
13817 int args_owned;
13818 Py_ssize_t arglen, argidx;
13819 PyObject *dict;
13820
13821 enum PyUnicode_Kind fmtkind;
13822 Py_ssize_t fmtcnt, fmtpos;
13823 void *fmtdata;
13824 PyObject *fmtstr;
13825
13826 _PyUnicodeWriter writer;
13827};
13828
13829struct unicode_format_arg_t {
13830 Py_UCS4 ch;
13831 int flags;
13832 Py_ssize_t width;
13833 int prec;
13834 int sign;
13835};
13836
Guido van Rossumd57fd912000-03-10 22:53:23 +000013837static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020013838unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013839{
Victor Stinnera47082312012-10-04 02:19:54 +020013840 Py_ssize_t argidx = ctx->argidx;
13841
13842 if (argidx < ctx->arglen) {
13843 ctx->argidx++;
13844 if (ctx->arglen < 0)
13845 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000013846 else
Victor Stinnera47082312012-10-04 02:19:54 +020013847 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013848 }
13849 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013850 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013851 return NULL;
13852}
13853
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013854/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013855
Victor Stinnera47082312012-10-04 02:19:54 +020013856/* Format a float into the writer if the writer is not NULL, or into *p_output
13857 otherwise.
13858
13859 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020013860static int
Victor Stinnera47082312012-10-04 02:19:54 +020013861formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
13862 PyObject **p_output,
13863 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013864{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013865 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013866 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013867 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020013868 int prec;
13869 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000013870
Guido van Rossumd57fd912000-03-10 22:53:23 +000013871 x = PyFloat_AsDouble(v);
13872 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020013873 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013874
Victor Stinnera47082312012-10-04 02:19:54 +020013875 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013876 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013877 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013878
Victor Stinnera47082312012-10-04 02:19:54 +020013879 if (arg->flags & F_ALT)
13880 dtoa_flags = Py_DTSF_ALT;
13881 else
13882 dtoa_flags = 0;
13883 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013884 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020013885 return -1;
13886 len = strlen(p);
13887 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010013888 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020013889 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013890 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020013891 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020013892 }
13893 else
13894 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000013895 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013896 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013897}
13898
Victor Stinnerd0880d52012-04-27 23:40:13 +020013899/* formatlong() emulates the format codes d, u, o, x and X, and
13900 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
13901 * Python's regular ints.
13902 * Return value: a new PyUnicodeObject*, or NULL if error.
13903 * The output string is of the form
13904 * "-"? ("0x" | "0X")? digit+
13905 * "0x"/"0X" are present only for x and X conversions, with F_ALT
13906 * set in flags. The case of hex digits will be correct,
13907 * There will be at least prec digits, zero-filled on the left if
13908 * necessary to get that many.
13909 * val object to be converted
13910 * flags bitmask of format flags; only F_ALT is looked at
13911 * prec minimum number of digits; 0-fill on left if needed
13912 * type a character in [duoxX]; u acts the same as d
13913 *
13914 * CAUTION: o, x and X conversions on regular ints can never
13915 * produce a '-' sign, but can for Python's unbounded ints.
13916 */
Ethan Furmanb95b5612015-01-23 20:05:18 -080013917PyObject *
13918_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
Tim Peters38fd5b62000-09-21 05:43:11 +000013919{
Victor Stinnerd0880d52012-04-27 23:40:13 +020013920 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013921 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013922 Py_ssize_t i;
13923 int sign; /* 1 if '-', else 0 */
13924 int len; /* number of characters */
13925 Py_ssize_t llen;
13926 int numdigits; /* len == numnondigits + numdigits */
13927 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000013928
Victor Stinnerd0880d52012-04-27 23:40:13 +020013929 /* Avoid exceeding SSIZE_T_MAX */
13930 if (prec > INT_MAX-3) {
13931 PyErr_SetString(PyExc_OverflowError,
13932 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013933 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013934 }
13935
13936 assert(PyLong_Check(val));
13937
13938 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020013939 default:
13940 assert(!"'type' not in [diuoxX]");
Victor Stinnerd0880d52012-04-27 23:40:13 +020013941 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020013942 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020013943 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070013944 /* int and int subclasses should print numerically when a numeric */
13945 /* format code is used (see issue18780) */
13946 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013947 break;
13948 case 'o':
13949 numnondigits = 2;
13950 result = PyNumber_ToBase(val, 8);
13951 break;
13952 case 'x':
13953 case 'X':
13954 numnondigits = 2;
13955 result = PyNumber_ToBase(val, 16);
13956 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013957 }
13958 if (!result)
13959 return NULL;
13960
13961 assert(unicode_modifiable(result));
13962 assert(PyUnicode_IS_READY(result));
13963 assert(PyUnicode_IS_ASCII(result));
13964
13965 /* To modify the string in-place, there can only be one reference. */
13966 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020013967 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013968 PyErr_BadInternalCall();
13969 return NULL;
13970 }
13971 buf = PyUnicode_DATA(result);
13972 llen = PyUnicode_GET_LENGTH(result);
13973 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020013974 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013975 PyErr_SetString(PyExc_ValueError,
Ethan Furmanb95b5612015-01-23 20:05:18 -080013976 "string too large in _PyUnicode_FormatLong");
Victor Stinnerd0880d52012-04-27 23:40:13 +020013977 return NULL;
13978 }
13979 len = (int)llen;
13980 sign = buf[0] == '-';
13981 numnondigits += sign;
13982 numdigits = len - numnondigits;
13983 assert(numdigits > 0);
13984
13985 /* Get rid of base marker unless F_ALT */
Ethan Furmanb95b5612015-01-23 20:05:18 -080013986 if (((alt) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020013987 (type == 'o' || type == 'x' || type == 'X'))) {
13988 assert(buf[sign] == '0');
13989 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
13990 buf[sign+1] == 'o');
13991 numnondigits -= 2;
13992 buf += 2;
13993 len -= 2;
13994 if (sign)
13995 buf[0] = '-';
13996 assert(len == numnondigits + numdigits);
13997 assert(numdigits > 0);
13998 }
13999
14000 /* Fill with leading zeroes to meet minimum width. */
14001 if (prec > numdigits) {
14002 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14003 numnondigits + prec);
14004 char *b1;
14005 if (!r1) {
14006 Py_DECREF(result);
14007 return NULL;
14008 }
14009 b1 = PyBytes_AS_STRING(r1);
14010 for (i = 0; i < numnondigits; ++i)
14011 *b1++ = *buf++;
14012 for (i = 0; i < prec - numdigits; i++)
14013 *b1++ = '0';
14014 for (i = 0; i < numdigits; i++)
14015 *b1++ = *buf++;
14016 *b1 = '\0';
14017 Py_DECREF(result);
14018 result = r1;
14019 buf = PyBytes_AS_STRING(result);
14020 len = numnondigits + prec;
14021 }
14022
14023 /* Fix up case for hex conversions. */
14024 if (type == 'X') {
14025 /* Need to convert all lower case letters to upper case.
14026 and need to convert 0x to 0X (and -0x to -0X). */
14027 for (i = 0; i < len; i++)
14028 if (buf[i] >= 'a' && buf[i] <= 'x')
14029 buf[i] -= 'a'-'A';
14030 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014031 if (!PyUnicode_Check(result)
14032 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020014033 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014034 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014035 Py_DECREF(result);
14036 result = unicode;
14037 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014038 else if (len != PyUnicode_GET_LENGTH(result)) {
14039 if (PyUnicode_Resize(&result, len) < 0)
14040 Py_CLEAR(result);
14041 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014042 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000014043}
14044
Ethan Furmandf3ed242014-01-05 06:50:30 -080014045/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014046 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014047 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014048 * -1 and raise an exception on error */
14049static int
Victor Stinnera47082312012-10-04 02:19:54 +020014050mainformatlong(PyObject *v,
14051 struct unicode_format_arg_t *arg,
14052 PyObject **p_output,
14053 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014054{
14055 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014056 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014057
14058 if (!PyNumber_Check(v))
14059 goto wrongtype;
14060
Ethan Furman9ab74802014-03-21 06:38:46 -070014061 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014062 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014063 if (type == 'o' || type == 'x' || type == 'X') {
14064 iobj = PyNumber_Index(v);
14065 if (iobj == NULL) {
Ethan Furman9ab74802014-03-21 06:38:46 -070014066 if (PyErr_ExceptionMatches(PyExc_TypeError))
14067 goto wrongtype;
Ethan Furman38d872e2014-03-19 08:38:52 -070014068 return -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014069 }
14070 }
14071 else {
14072 iobj = PyNumber_Long(v);
14073 if (iobj == NULL ) {
14074 if (PyErr_ExceptionMatches(PyExc_TypeError))
14075 goto wrongtype;
14076 return -1;
14077 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014078 }
14079 assert(PyLong_Check(iobj));
14080 }
14081 else {
14082 iobj = v;
14083 Py_INCREF(iobj);
14084 }
14085
14086 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014087 && arg->width == -1 && arg->prec == -1
14088 && !(arg->flags & (F_SIGN | F_BLANK))
14089 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014090 {
14091 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014092 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014093 int base;
14094
Victor Stinnera47082312012-10-04 02:19:54 +020014095 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014096 {
14097 default:
14098 assert(0 && "'type' not in [diuoxX]");
14099 case 'd':
14100 case 'i':
14101 case 'u':
14102 base = 10;
14103 break;
14104 case 'o':
14105 base = 8;
14106 break;
14107 case 'x':
14108 case 'X':
14109 base = 16;
14110 break;
14111 }
14112
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014113 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14114 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014115 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014116 }
14117 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014118 return 1;
14119 }
14120
Ethan Furmanb95b5612015-01-23 20:05:18 -080014121 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014122 Py_DECREF(iobj);
14123 if (res == NULL)
14124 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014125 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014126 return 0;
14127
14128wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014129 switch(type)
14130 {
14131 case 'o':
14132 case 'x':
14133 case 'X':
14134 PyErr_Format(PyExc_TypeError,
14135 "%%%c format: an integer is required, "
14136 "not %.200s",
14137 type, Py_TYPE(v)->tp_name);
14138 break;
14139 default:
14140 PyErr_Format(PyExc_TypeError,
14141 "%%%c format: a number is required, "
14142 "not %.200s",
14143 type, Py_TYPE(v)->tp_name);
14144 break;
14145 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014146 return -1;
14147}
14148
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014149static Py_UCS4
14150formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014151{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014152 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014153 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014154 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014155 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014156 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014157 goto onError;
14158 }
14159 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014160 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014161 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014162 /* make sure number is a type of integer */
14163 if (!PyLong_Check(v)) {
14164 iobj = PyNumber_Index(v);
14165 if (iobj == NULL) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014166 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014167 }
14168 v = iobj;
14169 Py_DECREF(iobj);
14170 }
14171 /* Integer input truncated to a character */
Benjamin Peterson29060642009-01-31 22:14:21 +000014172 x = PyLong_AsLong(v);
14173 if (x == -1 && PyErr_Occurred())
14174 goto onError;
14175
Victor Stinner8faf8212011-12-08 22:14:11 +010014176 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014177 PyErr_SetString(PyExc_OverflowError,
14178 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014179 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014180 }
14181
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014182 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014183 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014184
Benjamin Peterson29060642009-01-31 22:14:21 +000014185 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014186 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014187 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014188 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014189}
14190
Victor Stinnera47082312012-10-04 02:19:54 +020014191/* Parse options of an argument: flags, width, precision.
14192 Handle also "%(name)" syntax.
14193
14194 Return 0 if the argument has been formatted into arg->str.
14195 Return 1 if the argument has been written into ctx->writer,
14196 Raise an exception and return -1 on error. */
14197static int
14198unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14199 struct unicode_format_arg_t *arg)
14200{
14201#define FORMAT_READ(ctx) \
14202 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14203
14204 PyObject *v;
14205
Victor Stinnera47082312012-10-04 02:19:54 +020014206 if (arg->ch == '(') {
14207 /* Get argument value from a dictionary. Example: "%(name)s". */
14208 Py_ssize_t keystart;
14209 Py_ssize_t keylen;
14210 PyObject *key;
14211 int pcount = 1;
14212
14213 if (ctx->dict == NULL) {
14214 PyErr_SetString(PyExc_TypeError,
14215 "format requires a mapping");
14216 return -1;
14217 }
14218 ++ctx->fmtpos;
14219 --ctx->fmtcnt;
14220 keystart = ctx->fmtpos;
14221 /* Skip over balanced parentheses */
14222 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14223 arg->ch = FORMAT_READ(ctx);
14224 if (arg->ch == ')')
14225 --pcount;
14226 else if (arg->ch == '(')
14227 ++pcount;
14228 ctx->fmtpos++;
14229 }
14230 keylen = ctx->fmtpos - keystart - 1;
14231 if (ctx->fmtcnt < 0 || pcount > 0) {
14232 PyErr_SetString(PyExc_ValueError,
14233 "incomplete format key");
14234 return -1;
14235 }
14236 key = PyUnicode_Substring(ctx->fmtstr,
14237 keystart, keystart + keylen);
14238 if (key == NULL)
14239 return -1;
14240 if (ctx->args_owned) {
Victor Stinnera47082312012-10-04 02:19:54 +020014241 ctx->args_owned = 0;
Serhiy Storchaka191321d2015-12-27 15:41:34 +020014242 Py_DECREF(ctx->args);
Victor Stinnera47082312012-10-04 02:19:54 +020014243 }
14244 ctx->args = PyObject_GetItem(ctx->dict, key);
14245 Py_DECREF(key);
14246 if (ctx->args == NULL)
14247 return -1;
14248 ctx->args_owned = 1;
14249 ctx->arglen = -1;
14250 ctx->argidx = -2;
14251 }
14252
14253 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014254 while (--ctx->fmtcnt >= 0) {
14255 arg->ch = FORMAT_READ(ctx);
14256 ctx->fmtpos++;
14257 switch (arg->ch) {
14258 case '-': arg->flags |= F_LJUST; continue;
14259 case '+': arg->flags |= F_SIGN; continue;
14260 case ' ': arg->flags |= F_BLANK; continue;
14261 case '#': arg->flags |= F_ALT; continue;
14262 case '0': arg->flags |= F_ZERO; continue;
14263 }
14264 break;
14265 }
14266
14267 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014268 if (arg->ch == '*') {
14269 v = unicode_format_getnextarg(ctx);
14270 if (v == NULL)
14271 return -1;
14272 if (!PyLong_Check(v)) {
14273 PyErr_SetString(PyExc_TypeError,
14274 "* wants int");
14275 return -1;
14276 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014277 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014278 if (arg->width == -1 && PyErr_Occurred())
14279 return -1;
14280 if (arg->width < 0) {
14281 arg->flags |= F_LJUST;
14282 arg->width = -arg->width;
14283 }
14284 if (--ctx->fmtcnt >= 0) {
14285 arg->ch = FORMAT_READ(ctx);
14286 ctx->fmtpos++;
14287 }
14288 }
14289 else if (arg->ch >= '0' && arg->ch <= '9') {
14290 arg->width = arg->ch - '0';
14291 while (--ctx->fmtcnt >= 0) {
14292 arg->ch = FORMAT_READ(ctx);
14293 ctx->fmtpos++;
14294 if (arg->ch < '0' || arg->ch > '9')
14295 break;
14296 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14297 mixing signed and unsigned comparison. Since arg->ch is between
14298 '0' and '9', casting to int is safe. */
14299 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14300 PyErr_SetString(PyExc_ValueError,
14301 "width too big");
14302 return -1;
14303 }
14304 arg->width = arg->width*10 + (arg->ch - '0');
14305 }
14306 }
14307
14308 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014309 if (arg->ch == '.') {
14310 arg->prec = 0;
14311 if (--ctx->fmtcnt >= 0) {
14312 arg->ch = FORMAT_READ(ctx);
14313 ctx->fmtpos++;
14314 }
14315 if (arg->ch == '*') {
14316 v = unicode_format_getnextarg(ctx);
14317 if (v == NULL)
14318 return -1;
14319 if (!PyLong_Check(v)) {
14320 PyErr_SetString(PyExc_TypeError,
14321 "* wants int");
14322 return -1;
14323 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014324 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014325 if (arg->prec == -1 && PyErr_Occurred())
14326 return -1;
14327 if (arg->prec < 0)
14328 arg->prec = 0;
14329 if (--ctx->fmtcnt >= 0) {
14330 arg->ch = FORMAT_READ(ctx);
14331 ctx->fmtpos++;
14332 }
14333 }
14334 else if (arg->ch >= '0' && arg->ch <= '9') {
14335 arg->prec = arg->ch - '0';
14336 while (--ctx->fmtcnt >= 0) {
14337 arg->ch = FORMAT_READ(ctx);
14338 ctx->fmtpos++;
14339 if (arg->ch < '0' || arg->ch > '9')
14340 break;
14341 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14342 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014343 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014344 return -1;
14345 }
14346 arg->prec = arg->prec*10 + (arg->ch - '0');
14347 }
14348 }
14349 }
14350
14351 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14352 if (ctx->fmtcnt >= 0) {
14353 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14354 if (--ctx->fmtcnt >= 0) {
14355 arg->ch = FORMAT_READ(ctx);
14356 ctx->fmtpos++;
14357 }
14358 }
14359 }
14360 if (ctx->fmtcnt < 0) {
14361 PyErr_SetString(PyExc_ValueError,
14362 "incomplete format");
14363 return -1;
14364 }
14365 return 0;
14366
14367#undef FORMAT_READ
14368}
14369
14370/* Format one argument. Supported conversion specifiers:
14371
14372 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014373 - "i", "d", "u": int or float
14374 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014375 - "e", "E", "f", "F", "g", "G": float
14376 - "c": int or str (1 character)
14377
Victor Stinner8dbd4212012-12-04 09:30:24 +010014378 When possible, the output is written directly into the Unicode writer
14379 (ctx->writer). A string is created when padding is required.
14380
Victor Stinnera47082312012-10-04 02:19:54 +020014381 Return 0 if the argument has been formatted into *p_str,
14382 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014383 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014384static int
14385unicode_format_arg_format(struct unicode_formatter_t *ctx,
14386 struct unicode_format_arg_t *arg,
14387 PyObject **p_str)
14388{
14389 PyObject *v;
14390 _PyUnicodeWriter *writer = &ctx->writer;
14391
14392 if (ctx->fmtcnt == 0)
14393 ctx->writer.overallocate = 0;
14394
14395 if (arg->ch == '%') {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014396 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014397 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014398 return 1;
14399 }
14400
14401 v = unicode_format_getnextarg(ctx);
14402 if (v == NULL)
14403 return -1;
14404
Victor Stinnera47082312012-10-04 02:19:54 +020014405
14406 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014407 case 's':
14408 case 'r':
14409 case 'a':
14410 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14411 /* Fast path */
14412 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14413 return -1;
14414 return 1;
14415 }
14416
14417 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14418 *p_str = v;
14419 Py_INCREF(*p_str);
14420 }
14421 else {
14422 if (arg->ch == 's')
14423 *p_str = PyObject_Str(v);
14424 else if (arg->ch == 'r')
14425 *p_str = PyObject_Repr(v);
14426 else
14427 *p_str = PyObject_ASCII(v);
14428 }
14429 break;
14430
14431 case 'i':
14432 case 'd':
14433 case 'u':
14434 case 'o':
14435 case 'x':
14436 case 'X':
14437 {
14438 int ret = mainformatlong(v, arg, p_str, writer);
14439 if (ret != 0)
14440 return ret;
14441 arg->sign = 1;
14442 break;
14443 }
14444
14445 case 'e':
14446 case 'E':
14447 case 'f':
14448 case 'F':
14449 case 'g':
14450 case 'G':
14451 if (arg->width == -1 && arg->prec == -1
14452 && !(arg->flags & (F_SIGN | F_BLANK)))
14453 {
14454 /* Fast path */
14455 if (formatfloat(v, arg, NULL, writer) == -1)
14456 return -1;
14457 return 1;
14458 }
14459
14460 arg->sign = 1;
14461 if (formatfloat(v, arg, p_str, NULL) == -1)
14462 return -1;
14463 break;
14464
14465 case 'c':
14466 {
14467 Py_UCS4 ch = formatchar(v);
14468 if (ch == (Py_UCS4) -1)
14469 return -1;
14470 if (arg->width == -1 && arg->prec == -1) {
14471 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014472 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014473 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014474 return 1;
14475 }
14476 *p_str = PyUnicode_FromOrdinal(ch);
14477 break;
14478 }
14479
14480 default:
14481 PyErr_Format(PyExc_ValueError,
14482 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020014483 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020014484 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14485 (int)arg->ch,
14486 ctx->fmtpos - 1);
14487 return -1;
14488 }
14489 if (*p_str == NULL)
14490 return -1;
14491 assert (PyUnicode_Check(*p_str));
14492 return 0;
14493}
14494
14495static int
14496unicode_format_arg_output(struct unicode_formatter_t *ctx,
14497 struct unicode_format_arg_t *arg,
14498 PyObject *str)
14499{
14500 Py_ssize_t len;
14501 enum PyUnicode_Kind kind;
14502 void *pbuf;
14503 Py_ssize_t pindex;
14504 Py_UCS4 signchar;
14505 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014506 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014507 Py_ssize_t sublen;
14508 _PyUnicodeWriter *writer = &ctx->writer;
14509 Py_UCS4 fill;
14510
14511 fill = ' ';
14512 if (arg->sign && arg->flags & F_ZERO)
14513 fill = '0';
14514
14515 if (PyUnicode_READY(str) == -1)
14516 return -1;
14517
14518 len = PyUnicode_GET_LENGTH(str);
14519 if ((arg->width == -1 || arg->width <= len)
14520 && (arg->prec == -1 || arg->prec >= len)
14521 && !(arg->flags & (F_SIGN | F_BLANK)))
14522 {
14523 /* Fast path */
14524 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
14525 return -1;
14526 return 0;
14527 }
14528
14529 /* Truncate the string for "s", "r" and "a" formats
14530 if the precision is set */
14531 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
14532 if (arg->prec >= 0 && len > arg->prec)
14533 len = arg->prec;
14534 }
14535
14536 /* Adjust sign and width */
14537 kind = PyUnicode_KIND(str);
14538 pbuf = PyUnicode_DATA(str);
14539 pindex = 0;
14540 signchar = '\0';
14541 if (arg->sign) {
14542 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14543 if (ch == '-' || ch == '+') {
14544 signchar = ch;
14545 len--;
14546 pindex++;
14547 }
14548 else if (arg->flags & F_SIGN)
14549 signchar = '+';
14550 else if (arg->flags & F_BLANK)
14551 signchar = ' ';
14552 else
14553 arg->sign = 0;
14554 }
14555 if (arg->width < len)
14556 arg->width = len;
14557
14558 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014559 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014560 if (!(arg->flags & F_LJUST)) {
14561 if (arg->sign) {
14562 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014563 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014564 }
14565 else {
14566 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014567 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014568 }
14569 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014570 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
14571 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014572 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014573 }
14574
Victor Stinnera47082312012-10-04 02:19:54 +020014575 buflen = arg->width;
14576 if (arg->sign && len == arg->width)
14577 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014578 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020014579 return -1;
14580
14581 /* Write the sign if needed */
14582 if (arg->sign) {
14583 if (fill != ' ') {
14584 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14585 writer->pos += 1;
14586 }
14587 if (arg->width > len)
14588 arg->width--;
14589 }
14590
14591 /* Write the numeric prefix for "x", "X" and "o" formats
14592 if the alternate form is used.
14593 For example, write "0x" for the "%#x" format. */
14594 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14595 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14596 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14597 if (fill != ' ') {
14598 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14599 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14600 writer->pos += 2;
14601 pindex += 2;
14602 }
14603 arg->width -= 2;
14604 if (arg->width < 0)
14605 arg->width = 0;
14606 len -= 2;
14607 }
14608
14609 /* Pad left with the fill character if needed */
14610 if (arg->width > len && !(arg->flags & F_LJUST)) {
14611 sublen = arg->width - len;
14612 FILL(writer->kind, writer->data, fill, writer->pos, sublen);
14613 writer->pos += sublen;
14614 arg->width = len;
14615 }
14616
14617 /* If padding with spaces: write sign if needed and/or numeric prefix if
14618 the alternate form is used */
14619 if (fill == ' ') {
14620 if (arg->sign) {
14621 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14622 writer->pos += 1;
14623 }
14624 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14625 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14626 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
14627 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14628 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14629 writer->pos += 2;
14630 pindex += 2;
14631 }
14632 }
14633
14634 /* Write characters */
14635 if (len) {
14636 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14637 str, pindex, len);
14638 writer->pos += len;
14639 }
14640
14641 /* Pad right with the fill character if needed */
14642 if (arg->width > len) {
14643 sublen = arg->width - len;
14644 FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
14645 writer->pos += sublen;
14646 }
14647 return 0;
14648}
14649
14650/* Helper of PyUnicode_Format(): format one arg.
14651 Return 0 on success, raise an exception and return -1 on error. */
14652static int
14653unicode_format_arg(struct unicode_formatter_t *ctx)
14654{
14655 struct unicode_format_arg_t arg;
14656 PyObject *str;
14657 int ret;
14658
Victor Stinner8dbd4212012-12-04 09:30:24 +010014659 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
14660 arg.flags = 0;
14661 arg.width = -1;
14662 arg.prec = -1;
14663 arg.sign = 0;
14664 str = NULL;
14665
Victor Stinnera47082312012-10-04 02:19:54 +020014666 ret = unicode_format_arg_parse(ctx, &arg);
14667 if (ret == -1)
14668 return -1;
14669
14670 ret = unicode_format_arg_format(ctx, &arg, &str);
14671 if (ret == -1)
14672 return -1;
14673
14674 if (ret != 1) {
14675 ret = unicode_format_arg_output(ctx, &arg, str);
14676 Py_DECREF(str);
14677 if (ret == -1)
14678 return -1;
14679 }
14680
14681 if (ctx->dict && (ctx->argidx < ctx->arglen) && arg.ch != '%') {
14682 PyErr_SetString(PyExc_TypeError,
14683 "not all arguments converted during string formatting");
14684 return -1;
14685 }
14686 return 0;
14687}
14688
Alexander Belopolsky40018472011-02-26 01:02:56 +000014689PyObject *
14690PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014691{
Victor Stinnera47082312012-10-04 02:19:54 +020014692 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000014693
Guido van Rossumd57fd912000-03-10 22:53:23 +000014694 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014695 PyErr_BadInternalCall();
14696 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014697 }
Victor Stinnera47082312012-10-04 02:19:54 +020014698
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014699 if (ensure_unicode(format) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014700 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014701
14702 ctx.fmtstr = format;
Victor Stinnera47082312012-10-04 02:19:54 +020014703 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14704 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14705 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14706 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014707
Victor Stinner8f674cc2013-04-17 23:02:17 +020014708 _PyUnicodeWriter_Init(&ctx.writer);
14709 ctx.writer.min_length = ctx.fmtcnt + 100;
14710 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014711
Guido van Rossumd57fd912000-03-10 22:53:23 +000014712 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014713 ctx.arglen = PyTuple_Size(args);
14714 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014715 }
14716 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014717 ctx.arglen = -1;
14718 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014719 }
Victor Stinnera47082312012-10-04 02:19:54 +020014720 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040014721 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020014722 ctx.dict = args;
14723 else
14724 ctx.dict = NULL;
14725 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014726
Victor Stinnera47082312012-10-04 02:19:54 +020014727 while (--ctx.fmtcnt >= 0) {
14728 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020014729 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020014730
14731 nonfmtpos = ctx.fmtpos++;
14732 while (ctx.fmtcnt >= 0 &&
14733 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14734 ctx.fmtpos++;
14735 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014736 }
Victor Stinnera47082312012-10-04 02:19:54 +020014737 if (ctx.fmtcnt < 0) {
14738 ctx.fmtpos--;
14739 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020014740 }
Victor Stinneree4544c2012-05-09 22:24:08 +020014741
Victor Stinnercfc4c132013-04-03 01:48:39 +020014742 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14743 nonfmtpos, ctx.fmtpos) < 0)
14744 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014745 }
14746 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014747 ctx.fmtpos++;
14748 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000014749 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020014750 }
14751 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020014752
Victor Stinnera47082312012-10-04 02:19:54 +020014753 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014754 PyErr_SetString(PyExc_TypeError,
14755 "not all arguments converted during string formatting");
14756 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014757 }
14758
Victor Stinnera47082312012-10-04 02:19:54 +020014759 if (ctx.args_owned) {
14760 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014761 }
Victor Stinnera47082312012-10-04 02:19:54 +020014762 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014763
Benjamin Peterson29060642009-01-31 22:14:21 +000014764 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020014765 _PyUnicodeWriter_Dealloc(&ctx.writer);
14766 if (ctx.args_owned) {
14767 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014768 }
14769 return NULL;
14770}
14771
Jeremy Hylton938ace62002-07-17 16:30:39 +000014772static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000014773unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14774
Tim Peters6d6c1a32001-08-02 04:15:00 +000014775static PyObject *
14776unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14777{
Benjamin Peterson29060642009-01-31 22:14:21 +000014778 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014779 static char *kwlist[] = {"object", "encoding", "errors", 0};
14780 char *encoding = NULL;
14781 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000014782
Benjamin Peterson14339b62009-01-31 16:36:08 +000014783 if (type != &PyUnicode_Type)
14784 return unicode_subtype_new(type, args, kwds);
14785 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000014786 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000014787 return NULL;
14788 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020014789 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000014790 if (encoding == NULL && errors == NULL)
14791 return PyObject_Str(x);
14792 else
Benjamin Peterson29060642009-01-31 22:14:21 +000014793 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000014794}
14795
Guido van Rossume023fe02001-08-30 03:12:59 +000014796static PyObject *
14797unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14798{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014799 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014800 Py_ssize_t length, char_size;
14801 int share_wstr, share_utf8;
14802 unsigned int kind;
14803 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000014804
Benjamin Peterson14339b62009-01-31 16:36:08 +000014805 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014806
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014807 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014808 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014809 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014810 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050014811 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060014812 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014813 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060014814 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014815
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014816 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014817 if (self == NULL) {
14818 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014819 return NULL;
14820 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014821 kind = PyUnicode_KIND(unicode);
14822 length = PyUnicode_GET_LENGTH(unicode);
14823
14824 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014825#ifdef Py_DEBUG
14826 _PyUnicode_HASH(self) = -1;
14827#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014828 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014829#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014830 _PyUnicode_STATE(self).interned = 0;
14831 _PyUnicode_STATE(self).kind = kind;
14832 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020014833 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014834 _PyUnicode_STATE(self).ready = 1;
14835 _PyUnicode_WSTR(self) = NULL;
14836 _PyUnicode_UTF8_LENGTH(self) = 0;
14837 _PyUnicode_UTF8(self) = NULL;
14838 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020014839 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014840
14841 share_utf8 = 0;
14842 share_wstr = 0;
14843 if (kind == PyUnicode_1BYTE_KIND) {
14844 char_size = 1;
14845 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
14846 share_utf8 = 1;
14847 }
14848 else if (kind == PyUnicode_2BYTE_KIND) {
14849 char_size = 2;
14850 if (sizeof(wchar_t) == 2)
14851 share_wstr = 1;
14852 }
14853 else {
14854 assert(kind == PyUnicode_4BYTE_KIND);
14855 char_size = 4;
14856 if (sizeof(wchar_t) == 4)
14857 share_wstr = 1;
14858 }
14859
14860 /* Ensure we won't overflow the length. */
14861 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
14862 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014863 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014864 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014865 data = PyObject_MALLOC((length + 1) * char_size);
14866 if (data == NULL) {
14867 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014868 goto onError;
14869 }
14870
Victor Stinnerc3c74152011-10-02 20:39:55 +020014871 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014872 if (share_utf8) {
14873 _PyUnicode_UTF8_LENGTH(self) = length;
14874 _PyUnicode_UTF8(self) = data;
14875 }
14876 if (share_wstr) {
14877 _PyUnicode_WSTR_LENGTH(self) = length;
14878 _PyUnicode_WSTR(self) = (wchar_t *)data;
14879 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014880
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014881 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020014882 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014883 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014884#ifdef Py_DEBUG
14885 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14886#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020014887 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010014888 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014889
14890onError:
14891 Py_DECREF(unicode);
14892 Py_DECREF(self);
14893 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000014894}
14895
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000014896PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070014897"str(object='') -> str\n\
14898str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000014899\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100014900Create a new string object from the given object. If encoding or\n\
14901errors is specified, then the object must expose a data buffer\n\
14902that will be decoded using the given encoding and error handler.\n\
14903Otherwise, returns the result of object.__str__() (if defined)\n\
14904or repr(object).\n\
14905encoding defaults to sys.getdefaultencoding().\n\
14906errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000014907
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014908static PyObject *unicode_iter(PyObject *seq);
14909
Guido van Rossumd57fd912000-03-10 22:53:23 +000014910PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000014911 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014912 "str", /* tp_name */
14913 sizeof(PyUnicodeObject), /* tp_size */
14914 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014915 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014916 (destructor)unicode_dealloc, /* tp_dealloc */
14917 0, /* tp_print */
14918 0, /* tp_getattr */
14919 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014920 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014921 unicode_repr, /* tp_repr */
14922 &unicode_as_number, /* tp_as_number */
14923 &unicode_as_sequence, /* tp_as_sequence */
14924 &unicode_as_mapping, /* tp_as_mapping */
14925 (hashfunc) unicode_hash, /* tp_hash*/
14926 0, /* tp_call*/
14927 (reprfunc) unicode_str, /* tp_str */
14928 PyObject_GenericGetAttr, /* tp_getattro */
14929 0, /* tp_setattro */
14930 0, /* tp_as_buffer */
14931 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000014932 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014933 unicode_doc, /* tp_doc */
14934 0, /* tp_traverse */
14935 0, /* tp_clear */
14936 PyUnicode_RichCompare, /* tp_richcompare */
14937 0, /* tp_weaklistoffset */
14938 unicode_iter, /* tp_iter */
14939 0, /* tp_iternext */
14940 unicode_methods, /* tp_methods */
14941 0, /* tp_members */
14942 0, /* tp_getset */
14943 &PyBaseObject_Type, /* tp_base */
14944 0, /* tp_dict */
14945 0, /* tp_descr_get */
14946 0, /* tp_descr_set */
14947 0, /* tp_dictoffset */
14948 0, /* tp_init */
14949 0, /* tp_alloc */
14950 unicode_new, /* tp_new */
14951 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014952};
14953
14954/* Initialize the Unicode implementation */
14955
Victor Stinner3a50e702011-10-18 21:21:00 +020014956int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014957{
Thomas Wouters477c8d52006-05-27 19:21:47 +000014958 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014959 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000014960 0x000A, /* LINE FEED */
14961 0x000D, /* CARRIAGE RETURN */
14962 0x001C, /* FILE SEPARATOR */
14963 0x001D, /* GROUP SEPARATOR */
14964 0x001E, /* RECORD SEPARATOR */
14965 0x0085, /* NEXT LINE */
14966 0x2028, /* LINE SEPARATOR */
14967 0x2029, /* PARAGRAPH SEPARATOR */
14968 };
14969
Fred Drakee4315f52000-05-09 19:53:39 +000014970 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020014971 _Py_INCREF_UNICODE_EMPTY();
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014972 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014973 Py_FatalError("Can't create empty string");
Serhiy Storchaka678db842013-01-26 12:16:36 +020014974 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014975
Guido van Rossumcacfc072002-05-24 19:01:59 +000014976 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014977 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000014978
14979 /* initialize the linebreak bloom filter */
14980 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014981 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020014982 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014983
Christian Heimes26532f72013-07-20 14:57:16 +020014984 if (PyType_Ready(&EncodingMapType) < 0)
14985 Py_FatalError("Can't initialize encoding map type");
Victor Stinner3a50e702011-10-18 21:21:00 +020014986
Benjamin Petersonc4311282012-10-30 23:21:10 -040014987 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
14988 Py_FatalError("Can't initialize field name iterator type");
14989
14990 if (PyType_Ready(&PyFormatterIter_Type) < 0)
14991 Py_FatalError("Can't initialize formatter iter type");
Benjamin Petersone8ea97f2012-10-30 23:27:52 -040014992
Victor Stinner3a50e702011-10-18 21:21:00 +020014993 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014994}
14995
14996/* Finalize the Unicode implementation */
14997
/* Does no work in this implementation; the result is always 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int nothing_freed = 0;

    return nothing_freed;
}
15003
Guido van Rossumd57fd912000-03-10 22:53:23 +000015004void
Thomas Wouters78890102000-07-22 19:25:51 +000015005_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015006{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000015007 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015008
Serhiy Storchaka05997252013-01-26 12:14:02 +020015009 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +000015010
Serhiy Storchaka05997252013-01-26 12:14:02 +020015011 for (i = 0; i < 256; i++)
15012 Py_CLEAR(unicode_latin1[i]);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020015013 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000015014 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015015}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015016
Walter Dörwald16807132007-05-25 13:52:07 +000015017void
15018PyUnicode_InternInPlace(PyObject **p)
15019{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015020 PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015021 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015022#ifdef Py_DEBUG
15023 assert(s != NULL);
15024 assert(_PyUnicode_CHECK(s));
15025#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000015026 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020015027 return;
15028#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015029 /* If it's a subclass, we don't really know what putting
15030 it in the interned dict might do. */
15031 if (!PyUnicode_CheckExact(s))
15032 return;
15033 if (PyUnicode_CHECK_INTERNED(s))
15034 return;
15035 if (interned == NULL) {
15036 interned = PyDict_New();
15037 if (interned == NULL) {
15038 PyErr_Clear(); /* Don't leave an exception */
15039 return;
15040 }
15041 }
15042 /* It might be that the GetItem call fails even
15043 though the key is present in the dictionary,
15044 namely when this happens during a stack overflow. */
15045 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010015046 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015047 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000015048
Victor Stinnerf0335102013-04-14 19:13:03 +020015049 if (t) {
15050 Py_INCREF(t);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030015051 Py_SETREF(*p, t);
Victor Stinnerf0335102013-04-14 19:13:03 +020015052 return;
15053 }
Walter Dörwald16807132007-05-25 13:52:07 +000015054
Benjamin Peterson14339b62009-01-31 16:36:08 +000015055 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010015056 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015057 PyErr_Clear();
15058 PyThreadState_GET()->recursion_critical = 0;
15059 return;
15060 }
15061 PyThreadState_GET()->recursion_critical = 0;
15062 /* The two references in interned are not counted by refcnt.
15063 The deallocator will take care of this */
15064 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015065 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000015066}
15067
15068void
15069PyUnicode_InternImmortal(PyObject **p)
15070{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015071 PyUnicode_InternInPlace(p);
15072 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015073 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015074 Py_INCREF(*p);
15075 }
Walter Dörwald16807132007-05-25 13:52:07 +000015076}
15077
15078PyObject *
15079PyUnicode_InternFromString(const char *cp)
15080{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015081 PyObject *s = PyUnicode_FromString(cp);
15082 if (s == NULL)
15083 return NULL;
15084 PyUnicode_InternInPlace(&s);
15085 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015086}
15087
Alexander Belopolsky40018472011-02-26 01:02:56 +000015088void
15089_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000015090{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015091 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015092 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015093 Py_ssize_t i, n;
15094 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000015095
Benjamin Peterson14339b62009-01-31 16:36:08 +000015096 if (interned == NULL || !PyDict_Check(interned))
15097 return;
15098 keys = PyDict_Keys(interned);
15099 if (keys == NULL || !PyList_Check(keys)) {
15100 PyErr_Clear();
15101 return;
15102 }
Walter Dörwald16807132007-05-25 13:52:07 +000015103
Benjamin Peterson14339b62009-01-31 16:36:08 +000015104 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
15105 detector, interned unicode strings are not forcibly deallocated;
15106 rather, we give them their stolen references back, and then clear
15107 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000015108
Benjamin Peterson14339b62009-01-31 16:36:08 +000015109 n = PyList_GET_SIZE(keys);
15110 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000015111 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015112 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015113 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015114 if (PyUnicode_READY(s) == -1) {
15115 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015116 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015117 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015118 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015119 case SSTATE_NOT_INTERNED:
15120 /* XXX Shouldn't happen */
15121 break;
15122 case SSTATE_INTERNED_IMMORTAL:
15123 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015124 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015125 break;
15126 case SSTATE_INTERNED_MORTAL:
15127 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015128 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015129 break;
15130 default:
15131 Py_FatalError("Inconsistent interned string state.");
15132 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015133 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015134 }
15135 fprintf(stderr, "total size of all interned strings: "
15136 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
15137 "mortal/immortal\n", mortal_size, immortal_size);
15138 Py_DECREF(keys);
15139 PyDict_Clear(interned);
Serhiy Storchaka05997252013-01-26 12:14:02 +020015140 Py_CLEAR(interned);
Walter Dörwald16807132007-05-25 13:52:07 +000015141}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015142
15143
15144/********************* Unicode Iterator **************************/
15145
15146typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015147 PyObject_HEAD
15148 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015149 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015150} unicodeiterobject;
15151
15152static void
15153unicodeiter_dealloc(unicodeiterobject *it)
15154{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015155 _PyObject_GC_UNTRACK(it);
15156 Py_XDECREF(it->it_seq);
15157 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015158}
15159
15160static int
15161unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15162{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015163 Py_VISIT(it->it_seq);
15164 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015165}
15166
15167static PyObject *
15168unicodeiter_next(unicodeiterobject *it)
15169{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015170 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015171
Benjamin Peterson14339b62009-01-31 16:36:08 +000015172 assert(it != NULL);
15173 seq = it->it_seq;
15174 if (seq == NULL)
15175 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015176 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015177
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015178 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15179 int kind = PyUnicode_KIND(seq);
15180 void *data = PyUnicode_DATA(seq);
15181 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15182 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015183 if (item != NULL)
15184 ++it->it_index;
15185 return item;
15186 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015187
Benjamin Peterson14339b62009-01-31 16:36:08 +000015188 it->it_seq = NULL;
Serhiy Storchakafbb1c5e2016-03-30 20:40:02 +030015189 Py_DECREF(seq);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015190 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015191}
15192
15193static PyObject *
15194unicodeiter_len(unicodeiterobject *it)
15195{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015196 Py_ssize_t len = 0;
15197 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015198 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015199 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015200}
15201
15202PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15203
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015204static PyObject *
15205unicodeiter_reduce(unicodeiterobject *it)
15206{
15207 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020015208 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015209 it->it_seq, it->it_index);
15210 } else {
15211 PyObject *u = PyUnicode_FromUnicode(NULL, 0);
15212 if (u == NULL)
15213 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020015214 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015215 }
15216}
15217
15218PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15219
15220static PyObject *
15221unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15222{
15223 Py_ssize_t index = PyLong_AsSsize_t(state);
15224 if (index == -1 && PyErr_Occurred())
15225 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015226 if (it->it_seq != NULL) {
15227 if (index < 0)
15228 index = 0;
15229 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15230 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15231 it->it_index = index;
15232 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015233 Py_RETURN_NONE;
15234}
15235
15236PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15237
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015238static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015239 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015240 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015241 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15242 reduce_doc},
15243 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15244 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015245 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015246};
15247
15248PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015249 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15250 "str_iterator", /* tp_name */
15251 sizeof(unicodeiterobject), /* tp_basicsize */
15252 0, /* tp_itemsize */
15253 /* methods */
15254 (destructor)unicodeiter_dealloc, /* tp_dealloc */
15255 0, /* tp_print */
15256 0, /* tp_getattr */
15257 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000015258 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015259 0, /* tp_repr */
15260 0, /* tp_as_number */
15261 0, /* tp_as_sequence */
15262 0, /* tp_as_mapping */
15263 0, /* tp_hash */
15264 0, /* tp_call */
15265 0, /* tp_str */
15266 PyObject_GenericGetAttr, /* tp_getattro */
15267 0, /* tp_setattro */
15268 0, /* tp_as_buffer */
15269 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15270 0, /* tp_doc */
15271 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15272 0, /* tp_clear */
15273 0, /* tp_richcompare */
15274 0, /* tp_weaklistoffset */
15275 PyObject_SelfIter, /* tp_iter */
15276 (iternextfunc)unicodeiter_next, /* tp_iternext */
15277 unicodeiter_methods, /* tp_methods */
15278 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015279};
15280
15281static PyObject *
15282unicode_iter(PyObject *seq)
15283{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015284 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015285
Benjamin Peterson14339b62009-01-31 16:36:08 +000015286 if (!PyUnicode_Check(seq)) {
15287 PyErr_BadInternalCall();
15288 return NULL;
15289 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015290 if (PyUnicode_READY(seq) == -1)
15291 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015292 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15293 if (it == NULL)
15294 return NULL;
15295 it->it_index = 0;
15296 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015297 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015298 _PyObject_GC_TRACK(it);
15299 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015300}
15301
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015302
15303size_t
15304Py_UNICODE_strlen(const Py_UNICODE *u)
15305{
15306 int res = 0;
15307 while(*u++)
15308 res++;
15309 return res;
15310}
15311
15312Py_UNICODE*
15313Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15314{
15315 Py_UNICODE *u = s1;
15316 while ((*u++ = *s2++));
15317 return s1;
15318}
15319
15320Py_UNICODE*
15321Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15322{
15323 Py_UNICODE *u = s1;
15324 while ((*u++ = *s2++))
15325 if (n-- == 0)
15326 break;
15327 return s1;
15328}
15329
15330Py_UNICODE*
15331Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15332{
15333 Py_UNICODE *u1 = s1;
15334 u1 += Py_UNICODE_strlen(u1);
15335 Py_UNICODE_strcpy(u1, s2);
15336 return s1;
15337}
15338
15339int
15340Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15341{
15342 while (*s1 && *s2 && *s1 == *s2)
15343 s1++, s2++;
15344 if (*s1 && *s2)
15345 return (*s1 < *s2) ? -1 : +1;
15346 if (*s1)
15347 return 1;
15348 if (*s2)
15349 return -1;
15350 return 0;
15351}
15352
15353int
15354Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15355{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015356 Py_UNICODE u1, u2;
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015357 for (; n != 0; n--) {
15358 u1 = *s1;
15359 u2 = *s2;
15360 if (u1 != u2)
15361 return (u1 < u2) ? -1 : +1;
15362 if (u1 == '\0')
15363 return 0;
15364 s1++;
15365 s2++;
15366 }
15367 return 0;
15368}
15369
15370Py_UNICODE*
15371Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15372{
15373 const Py_UNICODE *p;
15374 for (p = s; *p; p++)
15375 if (*p == c)
15376 return (Py_UNICODE*)p;
15377 return NULL;
15378}
15379
15380Py_UNICODE*
15381Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15382{
15383 const Py_UNICODE *p;
15384 p = s + Py_UNICODE_strlen(s);
15385 while (p != s) {
15386 p--;
15387 if (*p == c)
15388 return (Py_UNICODE*)p;
15389 }
15390 return NULL;
15391}
Victor Stinner331ea922010-08-10 16:37:20 +000015392
Victor Stinner71133ff2010-09-01 23:43:53 +000015393Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015394PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015395{
Victor Stinner577db2c2011-10-11 22:12:48 +020015396 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015397 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015398
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015399 if (!PyUnicode_Check(unicode)) {
15400 PyErr_BadArgument();
15401 return NULL;
15402 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015403 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015404 if (u == NULL)
15405 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015406 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -070015407 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015408 PyErr_NoMemory();
15409 return NULL;
15410 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015411 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015412 size *= sizeof(Py_UNICODE);
15413 copy = PyMem_Malloc(size);
15414 if (copy == NULL) {
15415 PyErr_NoMemory();
15416 return NULL;
15417 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015418 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015419 return copy;
15420}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015421
Georg Brandl66c221e2010-10-14 07:04:07 +000015422/* A _string module, to export formatter_parser and formatter_field_name_split
15423 to the string.Formatter class implemented in Python. */
15424
15425static PyMethodDef _string_methods[] = {
15426 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15427 METH_O, PyDoc_STR("split the argument as a field name")},
15428 {"formatter_parser", (PyCFunction) formatter_parser,
15429 METH_O, PyDoc_STR("parse the argument as a format string")},
15430 {NULL, NULL}
15431};
15432
15433static struct PyModuleDef _string_module = {
15434 PyModuleDef_HEAD_INIT,
15435 "_string",
15436 PyDoc_STR("string helper module"),
15437 0,
15438 _string_methods,
15439 NULL,
15440 NULL,
15441 NULL,
15442 NULL
15443};
15444
15445PyMODINIT_FUNC
15446PyInit__string(void)
15447{
15448 return PyModule_Create(&_string_module);
15449}
15450
15451
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000015452#ifdef __cplusplus
15453}
15454#endif