blob: b58cf02a8c04eb87811227cd05757dfe6a31a393 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Raymond Hettingerac2ef652015-07-04 16:04:44 -070045#include "stringlib/eq.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000046
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000047#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000048#include <windows.h>
49#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000050
Larry Hastings61272b72014-01-07 12:41:53 -080051/*[clinic input]
Larry Hastingsc2047262014-01-25 20:43:29 -080052class str "PyUnicodeObject *" "&PyUnicode_Type"
Larry Hastings61272b72014-01-07 12:41:53 -080053[clinic start generated code]*/
Larry Hastings581ee362014-01-28 05:00:08 -080054/*[clinic end generated code: output=da39a3ee5e6b4b0d input=604e916854800fa8]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080055
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000056/* --- Globals ------------------------------------------------------------
57
Serhiy Storchaka05997252013-01-26 12:14:02 +020058NOTE: In the interpreter's initialization phase, some globals are currently
59 initialized dynamically as needed. In the process Unicode objects may
60 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000061
62*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000063
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000064
65#ifdef __cplusplus
66extern "C" {
67#endif
68
/* Largest valid Unicode code point: U+10FFFF (decimal 1,114,111). */
#define MAX_UNICODE 0x10ffff

/* Object check used by the accessor macros in this file.  Debug builds run
   the structural consistency check (without inspecting the characters);
   other builds only test the object type. */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020077
/* Field accessors.  The underscore-prefixed forms are plain casts with no
   checking (usable as lvalues); the forms that assert require a valid
   Unicode object. */

/* UTF-8 buffer pointer stored in the compact header. */
#define _PyUnicode_UTF8(op) (((PyCompactUnicodeObject*)(op))->utf8)

/* UTF-8 buffer of a ready string: for compact ASCII strings this is the
   memory directly after the PyASCIIObject header, otherwise the stored
   utf8 pointer. */
#define PyUnicode_UTF8(op)                                      \
    (assert(_PyUnicode_CHECK(op)),                              \
     assert(PyUnicode_IS_READY(op)),                            \
     (PyUnicode_IS_COMPACT_ASCII(op)                            \
          ? ((char*)((PyASCIIObject*)(op) + 1))                 \
          : _PyUnicode_UTF8(op)))

/* Stored length of the UTF-8 buffer. */
#define _PyUnicode_UTF8_LENGTH(op) (((PyCompactUnicodeObject*)(op))->utf8_length)

/* UTF-8 length of a ready string: the string length for compact ASCII
   strings, otherwise the stored utf8_length. */
#define PyUnicode_UTF8_LENGTH(op)                               \
    (assert(_PyUnicode_CHECK(op)),                              \
     assert(PyUnicode_IS_READY(op)),                            \
     (PyUnicode_IS_COMPACT_ASCII(op)                            \
          ? ((PyASCIIObject*)(op))->length                      \
          : _PyUnicode_UTF8_LENGTH(op)))

/* wchar_t representation and its length. */
#define _PyUnicode_WSTR(op)        (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op) (((PyCompactUnicodeObject*)(op))->wstr_length)

/* Unchecked access to the length, state and hash fields. */
#define _PyUnicode_LENGTH(op) (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)  (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)   (((PyASCIIObject *)(op))->hash)

/* Checked access to the kind and the length. */
#define _PyUnicode_KIND(op)                                     \
    (assert(_PyUnicode_CHECK(op)), ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                               \
    (assert(_PyUnicode_CHECK(op)), ((PyASCIIObject *)(op))->length)

/* Data pointer of a non-compact object (data.any member). */
#define _PyUnicode_DATA_ANY(op) (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200112
/* File-local replacement for PyUnicode_READY(): asserts that op is a valid
   Unicode object, then evaluates to 0 if it is already ready and to the
   result of _PyUnicode_Ready(op) otherwise. */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                                     \
    (assert(_PyUnicode_CHECK(op)),                              \
     (!PyUnicode_IS_READY(op) ? _PyUnicode_Ready(op) : 0))
Victor Stinner910337b2011-10-03 03:20:16 +0200119
/* True if the UTF-8 pointer of op is the same pointer as its character
   data.  Must not be used on compact ASCII strings (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                               \
    (assert(_PyUnicode_CHECK(op)),                              \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),                   \
     (PyUnicode_DATA(op) == _PyUnicode_UTF8(op)))
/* True if the wstr pointer of op is the same pointer as its character data.
   The macro only refers to its own argument, so it can be applied to any
   expression, not just to a variable that happens to be named "unicode". */
#define _PyUnicode_SHARE_WSTR(op)                               \
    (assert(_PyUnicode_CHECK(op)),                              \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
127
/* True if the Unicode object has its own UTF-8 memory block, i.e. it is not
   a compact ASCII string, its utf8 pointer is set, and that pointer is not
   the character data itself. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                          \
    (!PyUnicode_IS_COMPACT_ASCII(op)                            \
     && _PyUnicode_UTF8(op) != NULL                             \
     && _PyUnicode_UTF8(op) != PyUnicode_DATA(op))

/* True if the Unicode object has its own wstr memory block, i.e. its wstr
   pointer is set and either the object is not ready yet or the pointer is
   not the character data itself. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                          \
    (_PyUnicode_WSTR(op) != NULL                                \
     && (!PyUnicode_IS_READY(op)                                \
         || _PyUnicode_WSTR(op) != PyUnicode_DATA(op)))
141
/* Copy a run of characters while converting the element type.

   from_type and to_type must be valid type names.  begin and end delimit
   the source characters and should be of type "from_type *"; to should be
   of type "to_type *" and point to the buffer receiving the converted
   characters.  Each source element is converted with a plain cast.

   The bulk of the input is handled four elements per iteration; the
   remaining zero to three elements are copied one by one. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                        \
        to_type *_to = (to_type *)(to);                         \
        const from_type *_iter = (from_type *)(begin);          \
        const from_type *_end = (from_type *)(end);             \
        Py_ssize_t _count = _end - _iter;                       \
        const from_type *_unrolled_end =                        \
            _iter + _Py_SIZE_ROUND_DOWN(_count, 4);             \
        for (; _iter < _unrolled_end; _iter += 4, _to += 4) {   \
            _to[0] = (to_type) _iter[0];                        \
            _to[1] = (to_type) _iter[1];                        \
            _to[2] = (to_type) _iter[2];                        \
            _to[3] = (to_type) _iter[3];                        \
        }                                                       \
        for (; _iter < _end; _iter++, _to++) {                  \
            *_to = (to_type) *_iter;                            \
        }                                                       \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200165
/* Platform-specific overallocation factor. */
#ifndef MS_WINDOWS
   /* On Linux, overallocating by 25% is the best factor */
#  define OVERALLOCATE_FACTOR 4
#else
   /* On Windows, overallocating by 50% is the best factor */
#  define OVERALLOCATE_FACTOR 2
#endif
173
Walter Dörwald16807132007-05-25 13:52:07 +0000174/* This dictionary holds all interned unicode strings. Note that references
175 to strings in this dictionary are *not* counted in the string's ob_refcnt.
176 When the interned string reaches a refcnt of 0 the string deallocation
177 function will delete the reference from this dictionary.
178
179 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000180 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000181*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200182static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000183
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000184/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200185static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200186
/* Take a new reference to the shared empty string, creating the string with
   PyUnicode_New(0, 0) on first use.  If that creation fails, unicode_empty
   is left NULL and no reference is taken. */
#define _Py_INCREF_UNICODE_EMPTY()                              \
    do {                                                        \
        if (unicode_empty == NULL) {                            \
            unicode_empty = PyUnicode_New(0, 0);                \
            if (unicode_empty != NULL) {                        \
                Py_INCREF(unicode_empty);                       \
                assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
            }                                                   \
        }                                                       \
        else {                                                  \
            Py_INCREF(unicode_empty);                           \
        }                                                       \
    } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000199
/* Return a new reference to the shared empty string from the enclosing
   function.  The returned value is NULL if the empty string could not be
   created. */
#define _Py_RETURN_UNICODE_EMPTY()      \
    do {                                \
        _Py_INCREF_UNICODE_EMPTY();     \
        return unicode_empty;           \
    } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000205
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200206/* Forward declaration */
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700207static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200208_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
209
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200210/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200211static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200212
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000213/* Single character Unicode strings in the Latin-1 range are being
214 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200215static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000216
/* Fast detection of the most frequent whitespace characters: a 128-entry
   table indexed by ASCII code, 1 for whitespace and 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
       0x0B LINE TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
247
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200248/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200249static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200250static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100251static int unicode_modifiable(PyObject *unicode);
252
Victor Stinnerfe226c02011-10-03 03:52:20 +0200253
Alexander Belopolsky40018472011-02-26 01:02:56 +0000254static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100255_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200256static PyObject *
257_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
258static PyObject *
259_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
260
261static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000262unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000263 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100264 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000265 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
266
Alexander Belopolsky40018472011-02-26 01:02:56 +0000267static void
268raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300269 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100270 PyObject *unicode,
271 Py_ssize_t startpos, Py_ssize_t endpos,
272 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000273
/* Fast detection of linebreak characters: a 128-entry table indexed by
   ASCII code, 1 for a linebreak and 0 otherwise. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00-0x0F: 0x0A LINE FEED, 0x0B LINE TABULATION, 0x0C FORM FEED,
       0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20-0x2F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
301
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300302#include "clinic/unicodeobject.c.h"
303
/* Identifiers for the codec error handlers that are recognized by name. */
typedef enum {
    _Py_ERROR_UNKNOWN=0,
    _Py_ERROR_STRICT,
    _Py_ERROR_SURROGATEESCAPE,
    _Py_ERROR_REPLACE,
    _Py_ERROR_IGNORE,
    _Py_ERROR_BACKSLASHREPLACE,
    _Py_ERROR_SURROGATEPASS,
    _Py_ERROR_XMLCHARREFREPLACE,
    _Py_ERROR_OTHER
} _Py_error_handler;

/* Map an error handler name to its _Py_error_handler identifier.

   errors: NUL-terminated handler name, or NULL.

   Returns _Py_ERROR_STRICT for NULL and for "strict", the matching
   identifier for the other recognized names, and _Py_ERROR_OTHER for any
   other name.  The comparison is exact (case-sensitive). */
static _Py_error_handler
get_error_handler(const char *errors)
{
    static const struct {
        const char *name;
        _Py_error_handler handler;
    } known_handlers[] = {
        {"strict",            _Py_ERROR_STRICT},
        {"surrogateescape",   _Py_ERROR_SURROGATEESCAPE},
        {"replace",           _Py_ERROR_REPLACE},
        {"ignore",            _Py_ERROR_IGNORE},
        {"backslashreplace",  _Py_ERROR_BACKSLASHREPLACE},
        {"surrogatepass",     _Py_ERROR_SURROGATEPASS},
        {"xmlcharrefreplace", _Py_ERROR_XMLCHARREFREPLACE},
    };
    size_t idx;

    /* No handler given means the default, strict handling. */
    if (errors == NULL) {
        return _Py_ERROR_STRICT;
    }
    for (idx = 0; idx < sizeof(known_handlers) / sizeof(known_handlers[0]); idx++) {
        if (strcmp(errors, known_handlers[idx].name) == 0) {
            return known_handlers[idx].handler;
        }
    }
    return _Py_ERROR_OTHER;
}
342
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300343/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
344 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000345Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000346PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000347{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000348#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000349 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000350#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000351 /* This is actually an illegal character, so it should
352 not be passed to unichr. */
353 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000354#endif
355}
356
#ifdef Py_DEBUG
/* Debug-build check of the internal invariants of a Unicode object.

   op:            the object to check; must be a Unicode object.
   check_content: if nonzero (and the string is not in the legacy wchar
                  state), also scan the characters to verify that the
                  narrowest possible kind is used and that the data is
                  NUL-terminated.

   Every violated invariant triggers an assert().  Always returns 1, so the
   call can itself be wrapped in assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *base;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    base = (PyASCIIObject *)op;
    kind = base->state.kind;

    if (base->state.ascii == 1 && base->state.compact == 1) {
        /* compact ASCII string */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(base->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *chars;
        /* the kind whose character width equals the width of wchar_t */
#if SIZEOF_WCHAR_T == 2
        const unsigned int wchar_kind = PyUnicode_2BYTE_KIND;
#else
        const unsigned int wchar_kind = PyUnicode_4BYTE_KIND;
#endif

        if (base->state.compact == 1) {
            /* compact non-ASCII string: the characters follow the header */
            chars = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(base->state.ascii == 0);
            assert(base->state.ready == 1);
            assert(compact->utf8 != chars);
        }
        else if (kind == PyUnicode_WCHAR_KIND) {
            /* legacy string that is not ready: only wstr is set */
            chars = ((PyUnicodeObject *)op)->data.any;
            assert(base->length == 0);
            assert(base->hash == -1);
            assert(base->state.compact == 0);
            assert(base->state.ascii == 0);
            assert(base->state.ready == 0);
            assert(base->state.interned == SSTATE_NOT_INTERNED);
            assert(base->wstr != NULL);
            assert(chars == NULL);
            assert(compact->utf8 == NULL);
        }
        else {
            /* legacy string that is ready */
            chars = ((PyUnicodeObject *)op)->data.any;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(base->state.compact == 0);
            assert(base->state.ready == 1);
            assert(chars != NULL);
            if (base->state.ascii) {
                assert(compact->utf8 == chars);
                assert(compact->utf8_length == base->length);
            }
            else {
                assert(compact->utf8 != chars);
            }
        }

        /* wstr aliases the character data exactly when the kind has the
           width of wchar_t */
        if (kind != PyUnicode_WCHAR_KIND) {
            if (kind == wchar_kind) {
                assert(base->wstr == chars);
                assert(compact->wstr_length == base->length);
            }
            else {
                assert(base->wstr != chars);
            }
        }

        /* a missing buffer must have a zero length */
        assert(compact->utf8 != NULL || compact->utf8_length == 0);
        assert(base->wstr != NULL || compact->wstr_length == 0);
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        void *chars = PyUnicode_DATA(base);
        Py_UCS4 maxchar = 0;
        Py_ssize_t pos;

        for (pos = 0; pos < base->length; pos++) {
            Py_UCS4 ch = PyUnicode_READ(kind, chars, pos);
            if (maxchar < ch) {
                maxchar = ch;
            }
        }

        switch (kind) {
        case PyUnicode_1BYTE_KIND:
            if (base->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else {
                assert(maxchar < 128);
            }
            break;
        case PyUnicode_2BYTE_KIND:
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
            break;
        default:
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
            break;
        }
        /* the character after the last one must be the NUL terminator */
        assert(PyUnicode_READ(kind, chars, base->length) == 0);
    }
    return 1;
}
#endif
472
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100473static PyObject*
474unicode_result_wchar(PyObject *unicode)
475{
476#ifndef Py_DEBUG
477 Py_ssize_t len;
478
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100479 len = _PyUnicode_WSTR_LENGTH(unicode);
480 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100481 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200482 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100483 }
484
485 if (len == 1) {
486 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100487 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100488 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
489 Py_DECREF(unicode);
490 return latin1_char;
491 }
492 }
493
494 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200495 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100496 return NULL;
497 }
498#else
Victor Stinneraa771272012-10-04 02:32:58 +0200499 assert(Py_REFCNT(unicode) == 1);
500
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100501 /* don't make the result ready in debug mode to ensure that the caller
502 makes the string ready before using it */
503 assert(_PyUnicode_CheckConsistency(unicode, 1));
504#endif
505 return unicode;
506}
507
508static PyObject*
509unicode_result_ready(PyObject *unicode)
510{
511 Py_ssize_t length;
512
513 length = PyUnicode_GET_LENGTH(unicode);
514 if (length == 0) {
515 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100516 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200517 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100518 }
519 return unicode_empty;
520 }
521
522 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200523 void *data = PyUnicode_DATA(unicode);
524 int kind = PyUnicode_KIND(unicode);
525 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100526 if (ch < 256) {
527 PyObject *latin1_char = unicode_latin1[ch];
528 if (latin1_char != NULL) {
529 if (unicode != latin1_char) {
530 Py_INCREF(latin1_char);
531 Py_DECREF(unicode);
532 }
533 return latin1_char;
534 }
535 else {
536 assert(_PyUnicode_CheckConsistency(unicode, 1));
537 Py_INCREF(unicode);
538 unicode_latin1[ch] = unicode;
539 return unicode;
540 }
541 }
542 }
543
544 assert(_PyUnicode_CheckConsistency(unicode, 1));
545 return unicode;
546}
547
548static PyObject*
549unicode_result(PyObject *unicode)
550{
551 assert(_PyUnicode_CHECK(unicode));
552 if (PyUnicode_IS_READY(unicode))
553 return unicode_result_ready(unicode);
554 else
555 return unicode_result_wchar(unicode);
556}
557
Victor Stinnerc4b49542011-12-11 22:44:26 +0100558static PyObject*
559unicode_result_unchanged(PyObject *unicode)
560{
561 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500562 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100563 return NULL;
564 Py_INCREF(unicode);
565 return unicode;
566 }
567 else
568 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100569 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100570}
571
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200572/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
573 ASCII, Latin1, UTF-8, etc. */
574static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200575backslashreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200576 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
577{
Victor Stinnerad771582015-10-09 12:38:53 +0200578 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200579 Py_UCS4 ch;
580 enum PyUnicode_Kind kind;
581 void *data;
582
583 assert(PyUnicode_IS_READY(unicode));
584 kind = PyUnicode_KIND(unicode);
585 data = PyUnicode_DATA(unicode);
586
587 size = 0;
588 /* determine replacement size */
589 for (i = collstart; i < collend; ++i) {
590 Py_ssize_t incr;
591
592 ch = PyUnicode_READ(kind, data, i);
593 if (ch < 0x100)
594 incr = 2+2;
595 else if (ch < 0x10000)
596 incr = 2+4;
597 else {
598 assert(ch <= MAX_UNICODE);
Victor Stinner3fa36ff2015-10-09 03:37:11 +0200599 incr = 2+8;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200600 }
601 if (size > PY_SSIZE_T_MAX - incr) {
602 PyErr_SetString(PyExc_OverflowError,
603 "encoded result is too long for a Python string");
604 return NULL;
605 }
606 size += incr;
607 }
608
Victor Stinnerad771582015-10-09 12:38:53 +0200609 str = _PyBytesWriter_Prepare(writer, str, size);
610 if (str == NULL)
611 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200612
613 /* generate replacement */
614 for (i = collstart; i < collend; ++i) {
615 ch = PyUnicode_READ(kind, data, i);
Victor Stinner797485e2015-10-09 03:17:30 +0200616 *str++ = '\\';
617 if (ch >= 0x00010000) {
618 *str++ = 'U';
619 *str++ = Py_hexdigits[(ch>>28)&0xf];
620 *str++ = Py_hexdigits[(ch>>24)&0xf];
621 *str++ = Py_hexdigits[(ch>>20)&0xf];
622 *str++ = Py_hexdigits[(ch>>16)&0xf];
623 *str++ = Py_hexdigits[(ch>>12)&0xf];
624 *str++ = Py_hexdigits[(ch>>8)&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200625 }
Victor Stinner797485e2015-10-09 03:17:30 +0200626 else if (ch >= 0x100) {
627 *str++ = 'u';
628 *str++ = Py_hexdigits[(ch>>12)&0xf];
629 *str++ = Py_hexdigits[(ch>>8)&0xf];
630 }
631 else
632 *str++ = 'x';
633 *str++ = Py_hexdigits[(ch>>4)&0xf];
634 *str++ = Py_hexdigits[ch&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200635 }
636 return str;
637}
638
639/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
640 ASCII, Latin1, UTF-8, etc. */
641static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200642xmlcharrefreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200643 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
644{
Victor Stinnerad771582015-10-09 12:38:53 +0200645 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200646 Py_UCS4 ch;
647 enum PyUnicode_Kind kind;
648 void *data;
649
650 assert(PyUnicode_IS_READY(unicode));
651 kind = PyUnicode_KIND(unicode);
652 data = PyUnicode_DATA(unicode);
653
654 size = 0;
655 /* determine replacement size */
656 for (i = collstart; i < collend; ++i) {
657 Py_ssize_t incr;
658
659 ch = PyUnicode_READ(kind, data, i);
660 if (ch < 10)
661 incr = 2+1+1;
662 else if (ch < 100)
663 incr = 2+2+1;
664 else if (ch < 1000)
665 incr = 2+3+1;
666 else if (ch < 10000)
667 incr = 2+4+1;
668 else if (ch < 100000)
669 incr = 2+5+1;
670 else if (ch < 1000000)
671 incr = 2+6+1;
672 else {
673 assert(ch <= MAX_UNICODE);
674 incr = 2+7+1;
675 }
676 if (size > PY_SSIZE_T_MAX - incr) {
677 PyErr_SetString(PyExc_OverflowError,
678 "encoded result is too long for a Python string");
679 return NULL;
680 }
681 size += incr;
682 }
683
Victor Stinnerad771582015-10-09 12:38:53 +0200684 str = _PyBytesWriter_Prepare(writer, str, size);
685 if (str == NULL)
686 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200687
688 /* generate replacement */
689 for (i = collstart; i < collend; ++i) {
690 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
691 }
692 return str;
693}
694
Thomas Wouters477c8d52006-05-27 19:21:47 +0000695/* --- Bloom Filters ----------------------------------------------------- */
696
/* Simple "bloom filters" for Unicode characters.  To keep things simple, a
   filter is a single bitmask of BLOOM_WIDTH bits; the low bits of a
   character (ch & (BLOOM_WIDTH - 1)) select the bit that stands for it.
   BLOOM_WIDTH is the largest of 128, 64 or 32 that fits in a long. */

/* the linebreak mask is set up by Unicode_Init below */

#if LONG_BIT >= 128
#define BLOOM_WIDTH 128
#elif LONG_BIT >= 64
#define BLOOM_WIDTH 64
#elif LONG_BIT >= 32
#define BLOOM_WIDTH 32
#else
#error "LONG_BIT is smaller than 32"
#endif

#define BLOOM_MASK unsigned long

/* starts with every bit set, i.e. the filter rejects nothing */
static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;

/* Non-zero if the bit for ch is set in mask.  Both arguments are
   parenthesised so that any expression can be passed, for instance
   BLOOM(a | b, ch). */
#define BLOOM(mask, ch)     (((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))

/* Line break test: table lookup for ch < 128, otherwise the bloom mask is
   checked first and Py_UNICODE_ISLINEBREAK() only runs when its bit is set. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000722
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700723static inline BLOOM_MASK
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200724make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000725{
Victor Stinnera85af502013-04-09 21:53:54 +0200726#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
727 do { \
728 TYPE *data = (TYPE *)PTR; \
729 TYPE *end = data + LEN; \
730 Py_UCS4 ch; \
731 for (; data != end; data++) { \
732 ch = *data; \
733 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
734 } \
735 break; \
736 } while (0)
737
Thomas Wouters477c8d52006-05-27 19:21:47 +0000738 /* calculate simple bloom-style bitmask for a given unicode string */
739
Antoine Pitrouf068f942010-01-13 14:19:12 +0000740 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000741
742 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200743 switch (kind) {
744 case PyUnicode_1BYTE_KIND:
745 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
746 break;
747 case PyUnicode_2BYTE_KIND:
748 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
749 break;
750 case PyUnicode_4BYTE_KIND:
751 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
752 break;
753 default:
754 assert(0);
755 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000756 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200757
758#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000759}
760
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300761static int
762ensure_unicode(PyObject *obj)
763{
764 if (!PyUnicode_Check(obj)) {
765 PyErr_Format(PyExc_TypeError,
766 "must be str, not %.100s",
767 Py_TYPE(obj)->tp_name);
768 return -1;
769 }
770 return PyUnicode_READY(obj);
771}
772
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200773/* Compilation of templated routines */
774
775#include "stringlib/asciilib.h"
776#include "stringlib/fastsearch.h"
777#include "stringlib/partition.h"
778#include "stringlib/split.h"
779#include "stringlib/count.h"
780#include "stringlib/find.h"
781#include "stringlib/find_max_char.h"
782#include "stringlib/localeutil.h"
783#include "stringlib/undef.h"
784
785#include "stringlib/ucs1lib.h"
786#include "stringlib/fastsearch.h"
787#include "stringlib/partition.h"
788#include "stringlib/split.h"
789#include "stringlib/count.h"
790#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300791#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200792#include "stringlib/find_max_char.h"
793#include "stringlib/localeutil.h"
794#include "stringlib/undef.h"
795
796#include "stringlib/ucs2lib.h"
797#include "stringlib/fastsearch.h"
798#include "stringlib/partition.h"
799#include "stringlib/split.h"
800#include "stringlib/count.h"
801#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300802#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200803#include "stringlib/find_max_char.h"
804#include "stringlib/localeutil.h"
805#include "stringlib/undef.h"
806
807#include "stringlib/ucs4lib.h"
808#include "stringlib/fastsearch.h"
809#include "stringlib/partition.h"
810#include "stringlib/split.h"
811#include "stringlib/count.h"
812#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300813#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200814#include "stringlib/find_max_char.h"
815#include "stringlib/localeutil.h"
816#include "stringlib/undef.h"
817
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200818#include "stringlib/unicodedefs.h"
819#include "stringlib/fastsearch.h"
820#include "stringlib/count.h"
821#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100822#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200823
Guido van Rossumd57fd912000-03-10 22:53:23 +0000824/* --- Unicode Object ----------------------------------------------------- */
825
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200826static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200827fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200828
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700829static inline Py_ssize_t
830findchar(const void *s, int kind,
831 Py_ssize_t size, Py_UCS4 ch,
832 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200833{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200834 switch (kind) {
835 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200836 if ((Py_UCS1) ch != ch)
837 return -1;
838 if (direction > 0)
839 return ucs1lib_find_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
840 else
841 return ucs1lib_rfind_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200842 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200843 if ((Py_UCS2) ch != ch)
844 return -1;
845 if (direction > 0)
846 return ucs2lib_find_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
847 else
848 return ucs2lib_rfind_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200849 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200850 if (direction > 0)
851 return ucs4lib_find_char((Py_UCS4 *) s, size, ch);
852 else
853 return ucs4lib_rfind_char((Py_UCS4 *) s, size, ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200854 default:
855 assert(0);
856 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200857 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200858}
859
Victor Stinnerafffce42012-10-03 23:03:17 +0200860#ifdef Py_DEBUG
Martin Panter6245cb32016-04-15 02:14:19 +0000861/* Fill the data of a Unicode string with invalid characters to detect bugs
Victor Stinnerafffce42012-10-03 23:03:17 +0200862 earlier.
863
864 _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
865 ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
866 invalid character in Unicode 6.0. */
867static void
868unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
869{
870 int kind = PyUnicode_KIND(unicode);
871 Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
872 Py_ssize_t length = _PyUnicode_LENGTH(unicode);
873 if (length <= old_length)
874 return;
875 memset(data + old_length * kind, 0xff, (length - old_length) * kind);
876}
877#endif
878
Victor Stinnerfe226c02011-10-03 03:52:20 +0200879static PyObject*
880resize_compact(PyObject *unicode, Py_ssize_t length)
881{
882 Py_ssize_t char_size;
883 Py_ssize_t struct_size;
884 Py_ssize_t new_size;
885 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100886 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +0200887#ifdef Py_DEBUG
888 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
889#endif
890
Victor Stinner79891572012-05-03 13:43:07 +0200891 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200892 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100893 assert(PyUnicode_IS_COMPACT(unicode));
894
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200895 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100896 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200897 struct_size = sizeof(PyASCIIObject);
898 else
899 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200900 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200901
Victor Stinnerfe226c02011-10-03 03:52:20 +0200902 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
903 PyErr_NoMemory();
904 return NULL;
905 }
906 new_size = (struct_size + (length + 1) * char_size);
907
Serhiy Storchaka7aa69082015-12-03 01:02:03 +0200908 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
909 PyObject_DEL(_PyUnicode_UTF8(unicode));
910 _PyUnicode_UTF8(unicode) = NULL;
911 _PyUnicode_UTF8_LENGTH(unicode) = 0;
912 }
Victor Stinner84def372011-12-11 20:04:56 +0100913 _Py_DEC_REFTOTAL;
914 _Py_ForgetReference(unicode);
915
Serhiy Storchaka20b39b22014-09-28 11:27:24 +0300916 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +0100917 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100918 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200919 PyErr_NoMemory();
920 return NULL;
921 }
Victor Stinner84def372011-12-11 20:04:56 +0100922 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200923 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100924
Victor Stinnerfe226c02011-10-03 03:52:20 +0200925 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200926 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200927 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100928 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200929 _PyUnicode_WSTR_LENGTH(unicode) = length;
930 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100931 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
932 PyObject_DEL(_PyUnicode_WSTR(unicode));
933 _PyUnicode_WSTR(unicode) = NULL;
Victor Stinner5bc03a62016-01-27 16:56:53 +0100934 if (!PyUnicode_IS_ASCII(unicode))
935 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100936 }
Victor Stinnerafffce42012-10-03 23:03:17 +0200937#ifdef Py_DEBUG
938 unicode_fill_invalid(unicode, old_length);
939#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200940 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
941 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200942 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200943 return unicode;
944}
945
Alexander Belopolsky40018472011-02-26 01:02:56 +0000946static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200947resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000948{
Victor Stinner95663112011-10-04 01:03:50 +0200949 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100950 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200951 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200952 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000953
Victor Stinnerfe226c02011-10-03 03:52:20 +0200954 if (PyUnicode_IS_READY(unicode)) {
955 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200956 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200957 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +0200958#ifdef Py_DEBUG
959 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
960#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200961
962 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200963 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200964 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
965 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200966
967 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
968 PyErr_NoMemory();
969 return -1;
970 }
971 new_size = (length + 1) * char_size;
972
Victor Stinner7a9105a2011-12-12 00:13:42 +0100973 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
974 {
975 PyObject_DEL(_PyUnicode_UTF8(unicode));
976 _PyUnicode_UTF8(unicode) = NULL;
977 _PyUnicode_UTF8_LENGTH(unicode) = 0;
978 }
979
Victor Stinnerfe226c02011-10-03 03:52:20 +0200980 data = (PyObject *)PyObject_REALLOC(data, new_size);
981 if (data == NULL) {
982 PyErr_NoMemory();
983 return -1;
984 }
985 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200986 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200987 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200988 _PyUnicode_WSTR_LENGTH(unicode) = length;
989 }
990 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200991 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200992 _PyUnicode_UTF8_LENGTH(unicode) = length;
993 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200994 _PyUnicode_LENGTH(unicode) = length;
995 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +0200996#ifdef Py_DEBUG
997 unicode_fill_invalid(unicode, old_length);
998#endif
Victor Stinner95663112011-10-04 01:03:50 +0200999 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001000 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001001 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001002 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001003 }
Victor Stinner95663112011-10-04 01:03:50 +02001004 assert(_PyUnicode_WSTR(unicode) != NULL);
1005
1006 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001007 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +02001008 PyErr_NoMemory();
1009 return -1;
1010 }
Victor Stinner7a9105a2011-12-12 00:13:42 +01001011 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +02001012 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +01001013 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +02001014 if (!wstr) {
1015 PyErr_NoMemory();
1016 return -1;
1017 }
1018 _PyUnicode_WSTR(unicode) = wstr;
1019 _PyUnicode_WSTR(unicode)[length] = 0;
1020 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001021 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001022 return 0;
1023}
1024
Victor Stinnerfe226c02011-10-03 03:52:20 +02001025static PyObject*
1026resize_copy(PyObject *unicode, Py_ssize_t length)
1027{
1028 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001029 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001030 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001031
Benjamin Petersonbac79492012-01-14 13:34:47 -05001032 if (PyUnicode_READY(unicode) == -1)
Victor Stinner7a9105a2011-12-12 00:13:42 +01001033 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001034
1035 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1036 if (copy == NULL)
1037 return NULL;
1038
1039 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +02001040 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001041 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +02001042 }
1043 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001044 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001045
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001046 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001047 if (w == NULL)
1048 return NULL;
1049 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1050 copy_length = Py_MIN(copy_length, length);
Christian Heimesf051e432016-09-13 20:22:02 +02001051 memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +02001052 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001053 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001054 }
1055}
1056
Guido van Rossumd57fd912000-03-10 22:53:23 +00001057/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +00001058 Ux0000 terminated; some code (e.g. new_identifier)
1059 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001060
1061 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +00001062 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001063
1064*/
1065
Alexander Belopolsky40018472011-02-26 01:02:56 +00001066static PyUnicodeObject *
1067_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001068{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001069 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001070 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001071
Thomas Wouters477c8d52006-05-27 19:21:47 +00001072 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001073 if (length == 0 && unicode_empty != NULL) {
1074 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001075 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001076 }
1077
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001078 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001079 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001080 return (PyUnicodeObject *)PyErr_NoMemory();
1081 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001082 if (length < 0) {
1083 PyErr_SetString(PyExc_SystemError,
1084 "Negative size passed to _PyUnicode_New");
1085 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001086 }
1087
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001088 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1089 if (unicode == NULL)
1090 return NULL;
1091 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +01001092
1093 _PyUnicode_WSTR_LENGTH(unicode) = length;
1094 _PyUnicode_HASH(unicode) = -1;
1095 _PyUnicode_STATE(unicode).interned = 0;
1096 _PyUnicode_STATE(unicode).kind = 0;
1097 _PyUnicode_STATE(unicode).compact = 0;
1098 _PyUnicode_STATE(unicode).ready = 0;
1099 _PyUnicode_STATE(unicode).ascii = 0;
1100 _PyUnicode_DATA_ANY(unicode) = NULL;
1101 _PyUnicode_LENGTH(unicode) = 0;
1102 _PyUnicode_UTF8(unicode) = NULL;
1103 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1104
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001105 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1106 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001107 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00001108 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001109 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00001110 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001111
Jeremy Hyltond8082792003-09-16 19:41:39 +00001112 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +00001113 * the caller fails before initializing str -- unicode_resize()
1114 * reads str[0], and the Keep-Alive optimization can keep memory
1115 * allocated for str alive across a call to unicode_dealloc(unicode).
1116 * We don't want unicode_resize to read uninitialized memory in
1117 * that case.
1118 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001119 _PyUnicode_WSTR(unicode)[0] = 0;
1120 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +01001121
Victor Stinner7931d9a2011-11-04 00:22:48 +01001122 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001123 return unicode;
1124}
1125
Victor Stinnerf42dc442011-10-02 23:33:16 +02001126static const char*
1127unicode_kind_name(PyObject *unicode)
1128{
Victor Stinner42dfd712011-10-03 14:41:45 +02001129 /* don't check consistency: unicode_kind_name() is called from
1130 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +02001131 if (!PyUnicode_IS_COMPACT(unicode))
1132 {
1133 if (!PyUnicode_IS_READY(unicode))
1134 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -06001135 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001136 {
1137 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001138 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001139 return "legacy ascii";
1140 else
1141 return "legacy latin1";
1142 case PyUnicode_2BYTE_KIND:
1143 return "legacy UCS2";
1144 case PyUnicode_4BYTE_KIND:
1145 return "legacy UCS4";
1146 default:
1147 return "<legacy invalid kind>";
1148 }
1149 }
1150 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -06001151 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001152 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001153 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001154 return "ascii";
1155 else
Victor Stinnera3b334d2011-10-03 13:53:37 +02001156 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001157 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001158 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001159 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001160 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001161 default:
1162 return "<invalid compact kind>";
1163 }
1164}
1165
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001166#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001167/* Functions wrapping macros for use in debugger */
/* Debugger helper: function form of the PyUnicode_UTF8() macro. */
char *_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
1171
/* Debugger helper: function form of the _PyUnicode_COMPACT_DATA() macro. */
void *_PyUnicode_compact_data(void *unicode)
{
    void *payload = _PyUnicode_COMPACT_DATA(unicode);
    return payload;
}
1175void *_PyUnicode_data(void *unicode){
1176 printf("obj %p\n", unicode);
1177 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1178 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1179 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1180 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1181 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1182 return PyUnicode_DATA(unicode);
1183}
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001184
1185void
1186_PyUnicode_Dump(PyObject *op)
1187{
1188 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001189 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1190 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1191 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001192
Victor Stinnera849a4b2011-10-03 12:12:11 +02001193 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001194 {
1195 if (ascii->state.ascii)
1196 data = (ascii + 1);
1197 else
1198 data = (compact + 1);
1199 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001200 else
1201 data = unicode->data.any;
Victor Stinner293f3f52014-07-01 08:57:10 +02001202 printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1203 unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001204
Victor Stinnera849a4b2011-10-03 12:12:11 +02001205 if (ascii->wstr == data)
1206 printf("shared ");
1207 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001208
Victor Stinnera3b334d2011-10-03 13:53:37 +02001209 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinner293f3f52014-07-01 08:57:10 +02001210 printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
Victor Stinnera849a4b2011-10-03 12:12:11 +02001211 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1212 printf("shared ");
Victor Stinner293f3f52014-07-01 08:57:10 +02001213 printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
1214 compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001215 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001216 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001217}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001218#endif
1219
1220PyObject *
1221PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1222{
1223 PyObject *obj;
1224 PyCompactUnicodeObject *unicode;
1225 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001226 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001227 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001228 Py_ssize_t char_size;
1229 Py_ssize_t struct_size;
1230
1231 /* Optimization for empty strings */
1232 if (size == 0 && unicode_empty != NULL) {
1233 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001234 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001235 }
1236
Victor Stinner9e9d6892011-10-04 01:02:02 +02001237 is_ascii = 0;
1238 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001239 struct_size = sizeof(PyCompactUnicodeObject);
1240 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001241 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001242 char_size = 1;
1243 is_ascii = 1;
1244 struct_size = sizeof(PyASCIIObject);
1245 }
1246 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001247 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001248 char_size = 1;
1249 }
1250 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001251 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001252 char_size = 2;
1253 if (sizeof(wchar_t) == 2)
1254 is_sharing = 1;
1255 }
1256 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001257 if (maxchar > MAX_UNICODE) {
1258 PyErr_SetString(PyExc_SystemError,
1259 "invalid maximum character passed to PyUnicode_New");
1260 return NULL;
1261 }
Victor Stinner8f825062012-04-27 13:55:39 +02001262 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001263 char_size = 4;
1264 if (sizeof(wchar_t) == 4)
1265 is_sharing = 1;
1266 }
1267
1268 /* Ensure we won't overflow the size. */
1269 if (size < 0) {
1270 PyErr_SetString(PyExc_SystemError,
1271 "Negative size passed to PyUnicode_New");
1272 return NULL;
1273 }
1274 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1275 return PyErr_NoMemory();
1276
1277 /* Duplicated allocation code from _PyObject_New() instead of a call to
1278 * PyObject_New() so we are able to allocate space for the object and
1279 * it's data buffer.
1280 */
1281 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1282 if (obj == NULL)
1283 return PyErr_NoMemory();
1284 obj = PyObject_INIT(obj, &PyUnicode_Type);
1285 if (obj == NULL)
1286 return NULL;
1287
1288 unicode = (PyCompactUnicodeObject *)obj;
1289 if (is_ascii)
1290 data = ((PyASCIIObject*)obj) + 1;
1291 else
1292 data = unicode + 1;
1293 _PyUnicode_LENGTH(unicode) = size;
1294 _PyUnicode_HASH(unicode) = -1;
1295 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001296 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001297 _PyUnicode_STATE(unicode).compact = 1;
1298 _PyUnicode_STATE(unicode).ready = 1;
1299 _PyUnicode_STATE(unicode).ascii = is_ascii;
1300 if (is_ascii) {
1301 ((char*)data)[size] = 0;
1302 _PyUnicode_WSTR(unicode) = NULL;
1303 }
Victor Stinner8f825062012-04-27 13:55:39 +02001304 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001305 ((char*)data)[size] = 0;
1306 _PyUnicode_WSTR(unicode) = NULL;
1307 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001308 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001309 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001310 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001311 else {
1312 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001313 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001314 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001315 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001316 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001317 ((Py_UCS4*)data)[size] = 0;
1318 if (is_sharing) {
1319 _PyUnicode_WSTR_LENGTH(unicode) = size;
1320 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1321 }
1322 else {
1323 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1324 _PyUnicode_WSTR(unicode) = NULL;
1325 }
1326 }
Victor Stinner8f825062012-04-27 13:55:39 +02001327#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001328 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001329#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001330 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001331 return obj;
1332}
1333
#if SIZEOF_WCHAR_T == 2
/* Expand a buffer of 16-bit wchar_t units into the UCS4 storage of a string.

   A high surrogate immediately followed by a low surrogate is joined into a
   single code point; every other unit (a lone surrogate included) is copied
   through unchanged.  The remaining conversions are implemented as macros
   for efficiency, only this one needs a real function.

   `unicode` must be a 4-byte kind string whose length equals the number of
   decoded code points (checked by assertions only).  This function assumes
   that unicode can hold one more code point than that for a terminating
   null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        int is_pair;

        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));

        /* Short-circuit order matters: src[1] is only read when it lies
           inside the buffer. */
        is_pair = (Py_UNICODE_IS_HIGH_SURROGATE(src[0])
                   && (src + 1) < end
                   && Py_UNICODE_IS_LOW_SURROGATE(src[1]));
        if (!is_pair) {
            *dst++ = *src;
            src++;
            continue;
        }
        *dst++ = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
        src += 2;
    }

    /* The output must have been filled exactly. */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1373
Victor Stinnercd9950f2011-10-02 00:34:53 +02001374static int
Victor Stinner488fa492011-12-12 00:01:39 +01001375unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001376{
Victor Stinner488fa492011-12-12 00:01:39 +01001377 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001378 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001379 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001380 return -1;
1381 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001382 return 0;
1383}
1384
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001385static int
1386_copy_characters(PyObject *to, Py_ssize_t to_start,
1387 PyObject *from, Py_ssize_t from_start,
1388 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001389{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001390 unsigned int from_kind, to_kind;
1391 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001392
Victor Stinneree4544c2012-05-09 22:24:08 +02001393 assert(0 <= how_many);
1394 assert(0 <= from_start);
1395 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001396 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001397 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001398 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001399
Victor Stinnerd3f08822012-05-29 12:57:52 +02001400 assert(PyUnicode_Check(to));
1401 assert(PyUnicode_IS_READY(to));
1402 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1403
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001404 if (how_many == 0)
1405 return 0;
1406
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001407 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001408 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001409 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001410 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001411
Victor Stinnerf1852262012-06-16 16:38:26 +02001412#ifdef Py_DEBUG
1413 if (!check_maxchar
1414 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1415 {
1416 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1417 Py_UCS4 ch;
1418 Py_ssize_t i;
1419 for (i=0; i < how_many; i++) {
1420 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1421 assert(ch <= to_maxchar);
1422 }
1423 }
1424#endif
1425
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001426 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001427 if (check_maxchar
1428 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1429 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001430 /* Writing Latin-1 characters into an ASCII string requires to
1431 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001432 Py_UCS4 max_char;
1433 max_char = ucs1lib_find_max_char(from_data,
1434 (Py_UCS1*)from_data + how_many);
1435 if (max_char >= 128)
1436 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001437 }
Christian Heimesf051e432016-09-13 20:22:02 +02001438 memcpy((char*)to_data + to_kind * to_start,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001439 (char*)from_data + from_kind * from_start,
1440 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001441 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001442 else if (from_kind == PyUnicode_1BYTE_KIND
1443 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001444 {
1445 _PyUnicode_CONVERT_BYTES(
1446 Py_UCS1, Py_UCS2,
1447 PyUnicode_1BYTE_DATA(from) + from_start,
1448 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1449 PyUnicode_2BYTE_DATA(to) + to_start
1450 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001451 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001452 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001453 && to_kind == PyUnicode_4BYTE_KIND)
1454 {
1455 _PyUnicode_CONVERT_BYTES(
1456 Py_UCS1, Py_UCS4,
1457 PyUnicode_1BYTE_DATA(from) + from_start,
1458 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1459 PyUnicode_4BYTE_DATA(to) + to_start
1460 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001461 }
1462 else if (from_kind == PyUnicode_2BYTE_KIND
1463 && to_kind == PyUnicode_4BYTE_KIND)
1464 {
1465 _PyUnicode_CONVERT_BYTES(
1466 Py_UCS2, Py_UCS4,
1467 PyUnicode_2BYTE_DATA(from) + from_start,
1468 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1469 PyUnicode_4BYTE_DATA(to) + to_start
1470 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001471 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001472 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001473 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1474
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001475 if (!check_maxchar) {
1476 if (from_kind == PyUnicode_2BYTE_KIND
1477 && to_kind == PyUnicode_1BYTE_KIND)
1478 {
1479 _PyUnicode_CONVERT_BYTES(
1480 Py_UCS2, Py_UCS1,
1481 PyUnicode_2BYTE_DATA(from) + from_start,
1482 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1483 PyUnicode_1BYTE_DATA(to) + to_start
1484 );
1485 }
1486 else if (from_kind == PyUnicode_4BYTE_KIND
1487 && to_kind == PyUnicode_1BYTE_KIND)
1488 {
1489 _PyUnicode_CONVERT_BYTES(
1490 Py_UCS4, Py_UCS1,
1491 PyUnicode_4BYTE_DATA(from) + from_start,
1492 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1493 PyUnicode_1BYTE_DATA(to) + to_start
1494 );
1495 }
1496 else if (from_kind == PyUnicode_4BYTE_KIND
1497 && to_kind == PyUnicode_2BYTE_KIND)
1498 {
1499 _PyUnicode_CONVERT_BYTES(
1500 Py_UCS4, Py_UCS2,
1501 PyUnicode_4BYTE_DATA(from) + from_start,
1502 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1503 PyUnicode_2BYTE_DATA(to) + to_start
1504 );
1505 }
1506 else {
1507 assert(0);
1508 return -1;
1509 }
1510 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001511 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001512 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001513 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001514 Py_ssize_t i;
1515
Victor Stinnera0702ab2011-09-29 14:14:38 +02001516 for (i=0; i < how_many; i++) {
1517 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001518 if (ch > to_maxchar)
1519 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001520 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1521 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001522 }
1523 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001524 return 0;
1525}
1526
Victor Stinnerd3f08822012-05-29 12:57:52 +02001527void
1528_PyUnicode_FastCopyCharacters(
1529 PyObject *to, Py_ssize_t to_start,
1530 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001531{
1532 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1533}
1534
1535Py_ssize_t
1536PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1537 PyObject *from, Py_ssize_t from_start,
1538 Py_ssize_t how_many)
1539{
1540 int err;
1541
1542 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1543 PyErr_BadInternalCall();
1544 return -1;
1545 }
1546
Benjamin Petersonbac79492012-01-14 13:34:47 -05001547 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001548 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001549 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001550 return -1;
1551
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001552 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001553 PyErr_SetString(PyExc_IndexError, "string index out of range");
1554 return -1;
1555 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001556 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001557 PyErr_SetString(PyExc_IndexError, "string index out of range");
1558 return -1;
1559 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001560 if (how_many < 0) {
1561 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1562 return -1;
1563 }
1564 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001565 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1566 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001567 "Cannot write %zi characters at %zi "
1568 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001569 how_many, to_start, PyUnicode_GET_LENGTH(to));
1570 return -1;
1571 }
1572
1573 if (how_many == 0)
1574 return 0;
1575
Victor Stinner488fa492011-12-12 00:01:39 +01001576 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001577 return -1;
1578
1579 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1580 if (err) {
1581 PyErr_Format(PyExc_SystemError,
1582 "Cannot copy %s characters "
1583 "into a string of %s characters",
1584 unicode_kind_name(from),
1585 unicode_kind_name(to));
1586 return -1;
1587 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001588 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001589}
1590
Victor Stinner17222162011-09-28 22:15:37 +02001591/* Find the maximum code point and count the number of surrogate pairs so a
1592 correct string length can be computed before converting a string to UCS4.
1593 This function counts single surrogates as a character and not as a pair.
1594
1595 Return 0 on success, or -1 on error. */
1596static int
1597find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1598 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001599{
1600 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001601 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001602
Victor Stinnerc53be962011-10-02 21:33:54 +02001603 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001604 *num_surrogates = 0;
1605 *maxchar = 0;
1606
1607 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001608#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001609 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1610 && (iter+1) < end
1611 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1612 {
1613 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1614 ++(*num_surrogates);
1615 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001616 }
1617 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001618#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001619 {
1620 ch = *iter;
1621 iter++;
1622 }
1623 if (ch > *maxchar) {
1624 *maxchar = ch;
1625 if (*maxchar > MAX_UNICODE) {
1626 PyErr_Format(PyExc_ValueError,
1627 "character U+%x is not in range [U+0000; U+10ffff]",
1628 ch);
1629 return -1;
1630 }
1631 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001632 }
1633 return 0;
1634}
1635
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001636int
1637_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001638{
1639 wchar_t *end;
1640 Py_UCS4 maxchar = 0;
1641 Py_ssize_t num_surrogates;
1642#if SIZEOF_WCHAR_T == 2
1643 Py_ssize_t length_wo_surrogates;
1644#endif
1645
Georg Brandl7597add2011-10-05 16:36:47 +02001646 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001647 strings were created using _PyObject_New() and where no canonical
1648 representation (the str field) has been set yet aka strings
1649 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001650 assert(_PyUnicode_CHECK(unicode));
1651 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001652 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001653 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001654 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001655 /* Actually, it should neither be interned nor be anything else: */
1656 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001657
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001658 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001659 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001660 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001661 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001662
1663 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001664 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1665 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001666 PyErr_NoMemory();
1667 return -1;
1668 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001669 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001670 _PyUnicode_WSTR(unicode), end,
1671 PyUnicode_1BYTE_DATA(unicode));
1672 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1673 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1674 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1675 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001676 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001677 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001678 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001679 }
1680 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001681 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001682 _PyUnicode_UTF8(unicode) = NULL;
1683 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001684 }
1685 PyObject_FREE(_PyUnicode_WSTR(unicode));
1686 _PyUnicode_WSTR(unicode) = NULL;
1687 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1688 }
1689 /* In this case we might have to convert down from 4-byte native
1690 wchar_t to 2-byte unicode. */
1691 else if (maxchar < 65536) {
1692 assert(num_surrogates == 0 &&
1693 "FindMaxCharAndNumSurrogatePairs() messed up");
1694
Victor Stinner506f5922011-09-28 22:34:18 +02001695#if SIZEOF_WCHAR_T == 2
1696 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001697 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001698 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1699 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1700 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001701 _PyUnicode_UTF8(unicode) = NULL;
1702 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001703#else
1704 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001705 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001706 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001707 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001708 PyErr_NoMemory();
1709 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001710 }
Victor Stinner506f5922011-09-28 22:34:18 +02001711 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1712 _PyUnicode_WSTR(unicode), end,
1713 PyUnicode_2BYTE_DATA(unicode));
1714 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1715 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1716 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001717 _PyUnicode_UTF8(unicode) = NULL;
1718 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001719 PyObject_FREE(_PyUnicode_WSTR(unicode));
1720 _PyUnicode_WSTR(unicode) = NULL;
1721 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1722#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001723 }
1724 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1725 else {
1726#if SIZEOF_WCHAR_T == 2
1727 /* in case the native representation is 2-bytes, we need to allocate a
1728 new normalized 4-byte version. */
1729 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001730 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1731 PyErr_NoMemory();
1732 return -1;
1733 }
Victor Stinnerc3c74152011-10-02 20:39:55 +02001734 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1735 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001736 PyErr_NoMemory();
1737 return -1;
1738 }
1739 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1740 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001741 _PyUnicode_UTF8(unicode) = NULL;
1742 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001743 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1744 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001745 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001746 PyObject_FREE(_PyUnicode_WSTR(unicode));
1747 _PyUnicode_WSTR(unicode) = NULL;
1748 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1749#else
1750 assert(num_surrogates == 0);
1751
Victor Stinnerc3c74152011-10-02 20:39:55 +02001752 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001753 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001754 _PyUnicode_UTF8(unicode) = NULL;
1755 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001756 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1757#endif
1758 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1759 }
1760 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001761 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001762 return 0;
1763}
1764
Alexander Belopolsky40018472011-02-26 01:02:56 +00001765static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001766unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001767{
Walter Dörwald16807132007-05-25 13:52:07 +00001768 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001769 case SSTATE_NOT_INTERNED:
1770 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001771
Benjamin Peterson29060642009-01-31 22:14:21 +00001772 case SSTATE_INTERNED_MORTAL:
1773 /* revive dead object temporarily for DelItem */
1774 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001775 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001776 Py_FatalError(
1777 "deletion of interned string failed");
1778 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001779
Benjamin Peterson29060642009-01-31 22:14:21 +00001780 case SSTATE_INTERNED_IMMORTAL:
1781 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001782
Benjamin Peterson29060642009-01-31 22:14:21 +00001783 default:
1784 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001785 }
1786
Victor Stinner03490912011-10-03 23:45:12 +02001787 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001788 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001789 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001790 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001791 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1792 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001793
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001794 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001795}
1796
#ifdef Py_DEBUG
/* Debug helper: return 1 if `unicode` is one of the shared singletons
   (the empty string, or a cached one-character Latin-1 string from
   unicode_latin1[]), 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;

    /* Only a string in canonical form with exactly one character can be a
       cached Latin-1 character. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    return (ch < 256 && unicode_latin1[ch] == unicode);
}
#endif
1813
Alexander Belopolsky40018472011-02-26 01:02:56 +00001814static int
Victor Stinner488fa492011-12-12 00:01:39 +01001815unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001816{
Victor Stinner488fa492011-12-12 00:01:39 +01001817 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001818 if (Py_REFCNT(unicode) != 1)
1819 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001820 if (_PyUnicode_HASH(unicode) != -1)
1821 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001822 if (PyUnicode_CHECK_INTERNED(unicode))
1823 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001824 if (!PyUnicode_CheckExact(unicode))
1825 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001826#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001827 /* singleton refcount is greater than 1 */
1828 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001829#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001830 return 1;
1831}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001832
Victor Stinnerfe226c02011-10-03 03:52:20 +02001833static int
1834unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1835{
1836 PyObject *unicode;
1837 Py_ssize_t old_length;
1838
1839 assert(p_unicode != NULL);
1840 unicode = *p_unicode;
1841
1842 assert(unicode != NULL);
1843 assert(PyUnicode_Check(unicode));
1844 assert(0 <= length);
1845
Victor Stinner910337b2011-10-03 03:20:16 +02001846 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001847 old_length = PyUnicode_WSTR_LENGTH(unicode);
1848 else
1849 old_length = PyUnicode_GET_LENGTH(unicode);
1850 if (old_length == length)
1851 return 0;
1852
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001853 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001854 _Py_INCREF_UNICODE_EMPTY();
1855 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001856 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001857 Py_SETREF(*p_unicode, unicode_empty);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001858 return 0;
1859 }
1860
Victor Stinner488fa492011-12-12 00:01:39 +01001861 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001862 PyObject *copy = resize_copy(unicode, length);
1863 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001864 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001865 Py_SETREF(*p_unicode, copy);
Benjamin Peterson29060642009-01-31 22:14:21 +00001866 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001867 }
1868
Victor Stinnerfe226c02011-10-03 03:52:20 +02001869 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001870 PyObject *new_unicode = resize_compact(unicode, length);
1871 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001872 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001873 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001874 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001875 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001876 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001877}
1878
Alexander Belopolsky40018472011-02-26 01:02:56 +00001879int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001880PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001881{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001882 PyObject *unicode;
1883 if (p_unicode == NULL) {
1884 PyErr_BadInternalCall();
1885 return -1;
1886 }
1887 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001888 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001889 {
1890 PyErr_BadInternalCall();
1891 return -1;
1892 }
1893 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001894}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001895
Serhiy Storchakad65c9492015-11-02 14:10:23 +02001896/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001897
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001898 WARNING: The function doesn't copy the terminating null character and
1899 doesn't check the maximum character (may write a latin1 character in an
1900 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001901static void
1902unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1903 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001904{
1905 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1906 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001907 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001908
1909 switch (kind) {
1910 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001911 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02001912#ifdef Py_DEBUG
1913 if (PyUnicode_IS_ASCII(unicode)) {
1914 Py_UCS4 maxchar = ucs1lib_find_max_char(
1915 (const Py_UCS1*)str,
1916 (const Py_UCS1*)str + len);
1917 assert(maxchar < 128);
1918 }
1919#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001920 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001921 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001922 }
1923 case PyUnicode_2BYTE_KIND: {
1924 Py_UCS2 *start = (Py_UCS2 *)data + index;
1925 Py_UCS2 *ucs2 = start;
1926 assert(index <= PyUnicode_GET_LENGTH(unicode));
1927
Victor Stinner184252a2012-06-16 02:57:41 +02001928 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001929 *ucs2 = (Py_UCS2)*str;
1930
1931 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001932 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001933 }
1934 default: {
1935 Py_UCS4 *start = (Py_UCS4 *)data + index;
1936 Py_UCS4 *ucs4 = start;
1937 assert(kind == PyUnicode_4BYTE_KIND);
1938 assert(index <= PyUnicode_GET_LENGTH(unicode));
1939
Victor Stinner184252a2012-06-16 02:57:41 +02001940 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001941 *ucs4 = (Py_UCS4)*str;
1942
1943 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001944 }
1945 }
1946}
1947
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001948static PyObject*
1949get_latin1_char(unsigned char ch)
1950{
Victor Stinnera464fc12011-10-02 20:39:30 +02001951 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001952 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001953 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001954 if (!unicode)
1955 return NULL;
1956 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001957 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001958 unicode_latin1[ch] = unicode;
1959 }
1960 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001961 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001962}
1963
Victor Stinner985a82a2014-01-03 12:53:47 +01001964static PyObject*
1965unicode_char(Py_UCS4 ch)
1966{
1967 PyObject *unicode;
1968
1969 assert(ch <= MAX_UNICODE);
1970
Victor Stinnerf3b46b42014-01-03 13:16:00 +01001971 if (ch < 256)
1972 return get_latin1_char(ch);
1973
Victor Stinner985a82a2014-01-03 12:53:47 +01001974 unicode = PyUnicode_New(1, ch);
1975 if (unicode == NULL)
1976 return NULL;
1977 switch (PyUnicode_KIND(unicode)) {
1978 case PyUnicode_1BYTE_KIND:
1979 PyUnicode_1BYTE_DATA(unicode)[0] = (Py_UCS1)ch;
1980 break;
1981 case PyUnicode_2BYTE_KIND:
1982 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
1983 break;
1984 default:
1985 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1986 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
1987 }
1988 assert(_PyUnicode_CheckConsistency(unicode, 1));
1989 return unicode;
1990}
1991
Alexander Belopolsky40018472011-02-26 01:02:56 +00001992PyObject *
1993PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001994{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001995 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001996 Py_UCS4 maxchar = 0;
1997 Py_ssize_t num_surrogates;
1998
1999 if (u == NULL)
2000 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002001
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002002 /* If the Unicode data is known at construction time, we can apply
2003 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002004
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002005 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02002006 if (size == 0)
2007 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00002008
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002009 /* Single character Unicode objects in the Latin-1 range are
2010 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002011 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002012 return get_latin1_char((unsigned char)*u);
2013
2014 /* If not empty and not single character, copy the Unicode data
2015 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02002016 if (find_maxchar_surrogates(u, u + size,
2017 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002018 return NULL;
2019
Victor Stinner8faf8212011-12-08 22:14:11 +01002020 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002021 if (!unicode)
2022 return NULL;
2023
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002024 switch (PyUnicode_KIND(unicode)) {
2025 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002026 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002027 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2028 break;
2029 case PyUnicode_2BYTE_KIND:
2030#if Py_UNICODE_SIZE == 2
Christian Heimesf051e432016-09-13 20:22:02 +02002031 memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002032#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002033 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002034 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2035#endif
2036 break;
2037 case PyUnicode_4BYTE_KIND:
2038#if SIZEOF_WCHAR_T == 2
2039 /* This is the only case which has to process surrogates, thus
2040 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02002041 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002042#else
2043 assert(num_surrogates == 0);
Christian Heimesf051e432016-09-13 20:22:02 +02002044 memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002045#endif
2046 break;
2047 default:
2048 assert(0 && "Impossible state");
2049 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002050
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002051 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002052}
2053
Alexander Belopolsky40018472011-02-26 01:02:56 +00002054PyObject *
2055PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002056{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002057 if (size < 0) {
2058 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00002059 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00002060 return NULL;
2061 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002062 if (u != NULL)
2063 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2064 else
2065 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002066}
2067
Alexander Belopolsky40018472011-02-26 01:02:56 +00002068PyObject *
2069PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00002070{
2071 size_t size = strlen(u);
2072 if (size > PY_SSIZE_T_MAX) {
2073 PyErr_SetString(PyExc_OverflowError, "input too long");
2074 return NULL;
2075 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002076 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002077}
2078
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002079PyObject *
2080_PyUnicode_FromId(_Py_Identifier *id)
2081{
2082 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01002083 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
2084 strlen(id->string),
2085 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002086 if (!id->object)
2087 return NULL;
2088 PyUnicode_InternInPlace(&id->object);
2089 assert(!id->next);
2090 id->next = static_strings;
2091 static_strings = id;
2092 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002093 return id->object;
2094}
2095
2096void
2097_PyUnicode_ClearStaticStrings()
2098{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002099 _Py_Identifier *tmp, *s = static_strings;
2100 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02002101 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002102 tmp = s->next;
2103 s->next = NULL;
2104 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002105 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002106 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002107}
2108
Benjamin Peterson0df54292012-03-26 14:50:32 -04002109/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002110
Victor Stinnerd3f08822012-05-29 12:57:52 +02002111PyObject*
2112_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02002113{
Victor Stinnerd3f08822012-05-29 12:57:52 +02002114 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01002115 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01002116 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02002117#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002118 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02002119#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002120 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01002121 }
Victor Stinner785938e2011-12-11 20:09:03 +01002122 unicode = PyUnicode_New(size, 127);
2123 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02002124 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01002125 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2126 assert(_PyUnicode_CheckConsistency(unicode, 1));
2127 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02002128}
2129
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002130static Py_UCS4
2131kind_maxchar_limit(unsigned int kind)
2132{
Benjamin Petersonead6b532011-12-20 17:23:42 -06002133 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002134 case PyUnicode_1BYTE_KIND:
2135 return 0x80;
2136 case PyUnicode_2BYTE_KIND:
2137 return 0x100;
2138 case PyUnicode_4BYTE_KIND:
2139 return 0x10000;
2140 default:
2141 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01002142 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002143 }
2144}
2145
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -07002146static inline Py_UCS4
Victor Stinnere6abb482012-05-02 01:15:40 +02002147align_maxchar(Py_UCS4 maxchar)
2148{
2149 if (maxchar <= 127)
2150 return 127;
2151 else if (maxchar <= 255)
2152 return 255;
2153 else if (maxchar <= 65535)
2154 return 65535;
2155 else
2156 return MAX_UNICODE;
2157}
2158
Victor Stinner702c7342011-10-05 13:50:52 +02002159static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002160_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00002161{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002162 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002163 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002164
Serhiy Storchaka678db842013-01-26 12:16:36 +02002165 if (size == 0)
2166 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002167 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002168 if (size == 1)
2169 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002170
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002171 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002172 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002173 if (!res)
2174 return NULL;
2175 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002176 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002177 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00002178}
2179
Victor Stinnere57b1c02011-09-28 22:20:48 +02002180static PyObject*
2181_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002182{
2183 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002184 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002185
Serhiy Storchaka678db842013-01-26 12:16:36 +02002186 if (size == 0)
2187 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002188 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002189 if (size == 1)
2190 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002191
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002192 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002193 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002194 if (!res)
2195 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002196 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002197 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002198 else {
2199 _PyUnicode_CONVERT_BYTES(
2200 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2201 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002202 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002203 return res;
2204}
2205
Victor Stinnere57b1c02011-09-28 22:20:48 +02002206static PyObject*
2207_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002208{
2209 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002210 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002211
Serhiy Storchaka678db842013-01-26 12:16:36 +02002212 if (size == 0)
2213 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002214 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002215 if (size == 1)
2216 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002217
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002218 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002219 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002220 if (!res)
2221 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002222 if (max_char < 256)
2223 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2224 PyUnicode_1BYTE_DATA(res));
2225 else if (max_char < 0x10000)
2226 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2227 PyUnicode_2BYTE_DATA(res));
2228 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002229 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002230 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002231 return res;
2232}
2233
2234PyObject*
2235PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2236{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002237 if (size < 0) {
2238 PyErr_SetString(PyExc_ValueError, "size must be positive");
2239 return NULL;
2240 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002241 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002242 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002243 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002244 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002245 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002246 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002247 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002248 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002249 PyErr_SetString(PyExc_SystemError, "invalid kind");
2250 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002251 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002252}
2253
Victor Stinnerece58de2012-04-23 23:36:38 +02002254Py_UCS4
2255_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2256{
2257 enum PyUnicode_Kind kind;
2258 void *startptr, *endptr;
2259
2260 assert(PyUnicode_IS_READY(unicode));
2261 assert(0 <= start);
2262 assert(end <= PyUnicode_GET_LENGTH(unicode));
2263 assert(start <= end);
2264
2265 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2266 return PyUnicode_MAX_CHAR_VALUE(unicode);
2267
2268 if (start == end)
2269 return 127;
2270
Victor Stinner94d558b2012-04-27 22:26:58 +02002271 if (PyUnicode_IS_ASCII(unicode))
2272 return 127;
2273
Victor Stinnerece58de2012-04-23 23:36:38 +02002274 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002275 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002276 endptr = (char *)startptr + end * kind;
2277 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002278 switch(kind) {
2279 case PyUnicode_1BYTE_KIND:
2280 return ucs1lib_find_max_char(startptr, endptr);
2281 case PyUnicode_2BYTE_KIND:
2282 return ucs2lib_find_max_char(startptr, endptr);
2283 case PyUnicode_4BYTE_KIND:
2284 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002285 default:
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002286 assert(0);
2287 return 0;
Victor Stinnerece58de2012-04-23 23:36:38 +02002288 }
2289}
2290
Victor Stinner25a4b292011-10-06 12:31:55 +02002291/* Ensure that a string uses the most efficient storage, if it is not the
2292 case: create a new string with of the right kind. Write NULL into *p_unicode
2293 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002294static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002295unicode_adjust_maxchar(PyObject **p_unicode)
2296{
2297 PyObject *unicode, *copy;
2298 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002299 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002300 unsigned int kind;
2301
2302 assert(p_unicode != NULL);
2303 unicode = *p_unicode;
2304 assert(PyUnicode_IS_READY(unicode));
2305 if (PyUnicode_IS_ASCII(unicode))
2306 return;
2307
2308 len = PyUnicode_GET_LENGTH(unicode);
2309 kind = PyUnicode_KIND(unicode);
2310 if (kind == PyUnicode_1BYTE_KIND) {
2311 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002312 max_char = ucs1lib_find_max_char(u, u + len);
2313 if (max_char >= 128)
2314 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002315 }
2316 else if (kind == PyUnicode_2BYTE_KIND) {
2317 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002318 max_char = ucs2lib_find_max_char(u, u + len);
2319 if (max_char >= 256)
2320 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002321 }
2322 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002323 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002324 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002325 max_char = ucs4lib_find_max_char(u, u + len);
2326 if (max_char >= 0x10000)
2327 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002328 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002329 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002330 if (copy != NULL)
2331 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002332 Py_DECREF(unicode);
2333 *p_unicode = copy;
2334}
2335
Victor Stinner034f6cf2011-09-30 02:26:44 +02002336PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002337_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002338{
Victor Stinner87af4f22011-11-21 23:03:47 +01002339 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002340 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002341
Victor Stinner034f6cf2011-09-30 02:26:44 +02002342 if (!PyUnicode_Check(unicode)) {
2343 PyErr_BadInternalCall();
2344 return NULL;
2345 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002346 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002347 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002348
Victor Stinner87af4f22011-11-21 23:03:47 +01002349 length = PyUnicode_GET_LENGTH(unicode);
2350 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002351 if (!copy)
2352 return NULL;
2353 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2354
Christian Heimesf051e432016-09-13 20:22:02 +02002355 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
Victor Stinner87af4f22011-11-21 23:03:47 +01002356 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002357 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002358 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002359}
2360
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002361
Victor Stinnerbc603d12011-10-02 01:00:40 +02002362/* Widen Unicode objects to larger buffers. Don't write terminating null
2363 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002364
2365void*
2366_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2367{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002368 Py_ssize_t len;
2369 void *result;
2370 unsigned int skind;
2371
Benjamin Petersonbac79492012-01-14 13:34:47 -05002372 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002373 return NULL;
2374
2375 len = PyUnicode_GET_LENGTH(s);
2376 skind = PyUnicode_KIND(s);
2377 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002378 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002379 return NULL;
2380 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002381 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002382 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002383 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002384 if (!result)
2385 return PyErr_NoMemory();
2386 assert(skind == PyUnicode_1BYTE_KIND);
2387 _PyUnicode_CONVERT_BYTES(
2388 Py_UCS1, Py_UCS2,
2389 PyUnicode_1BYTE_DATA(s),
2390 PyUnicode_1BYTE_DATA(s) + len,
2391 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002392 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002393 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002394 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002395 if (!result)
2396 return PyErr_NoMemory();
2397 if (skind == PyUnicode_2BYTE_KIND) {
2398 _PyUnicode_CONVERT_BYTES(
2399 Py_UCS2, Py_UCS4,
2400 PyUnicode_2BYTE_DATA(s),
2401 PyUnicode_2BYTE_DATA(s) + len,
2402 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002403 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002404 else {
2405 assert(skind == PyUnicode_1BYTE_KIND);
2406 _PyUnicode_CONVERT_BYTES(
2407 Py_UCS1, Py_UCS4,
2408 PyUnicode_1BYTE_DATA(s),
2409 PyUnicode_1BYTE_DATA(s) + len,
2410 result);
2411 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002412 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002413 default:
2414 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002415 }
Victor Stinner01698042011-10-04 00:04:26 +02002416 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002417 return NULL;
2418}
2419
2420static Py_UCS4*
2421as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2422 int copy_null)
2423{
2424 int kind;
2425 void *data;
2426 Py_ssize_t len, targetlen;
2427 if (PyUnicode_READY(string) == -1)
2428 return NULL;
2429 kind = PyUnicode_KIND(string);
2430 data = PyUnicode_DATA(string);
2431 len = PyUnicode_GET_LENGTH(string);
2432 targetlen = len;
2433 if (copy_null)
2434 targetlen++;
2435 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002436 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002437 if (!target) {
2438 PyErr_NoMemory();
2439 return NULL;
2440 }
2441 }
2442 else {
2443 if (targetsize < targetlen) {
2444 PyErr_Format(PyExc_SystemError,
2445 "string is longer than the buffer");
2446 if (copy_null && 0 < targetsize)
2447 target[0] = 0;
2448 return NULL;
2449 }
2450 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002451 if (kind == PyUnicode_1BYTE_KIND) {
2452 Py_UCS1 *start = (Py_UCS1 *) data;
2453 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002454 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002455 else if (kind == PyUnicode_2BYTE_KIND) {
2456 Py_UCS2 *start = (Py_UCS2 *) data;
2457 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2458 }
2459 else {
2460 assert(kind == PyUnicode_4BYTE_KIND);
Christian Heimesf051e432016-09-13 20:22:02 +02002461 memcpy(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002462 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002463 if (copy_null)
2464 target[len] = 0;
2465 return target;
2466}
2467
2468Py_UCS4*
2469PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2470 int copy_null)
2471{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002472 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002473 PyErr_BadInternalCall();
2474 return NULL;
2475 }
2476 return as_ucs4(string, target, targetsize, copy_null);
2477}
2478
2479Py_UCS4*
2480PyUnicode_AsUCS4Copy(PyObject *string)
2481{
2482 return as_ucs4(string, NULL, 0, 1);
2483}
2484
2485#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00002486
Alexander Belopolsky40018472011-02-26 01:02:56 +00002487PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02002488PyUnicode_FromWideChar(const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002489{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002490 if (w == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002491 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02002492 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson29060642009-01-31 22:14:21 +00002493 PyErr_BadInternalCall();
2494 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002495 }
2496
Martin v. Löwis790465f2008-04-05 20:41:37 +00002497 if (size == -1) {
2498 size = wcslen(w);
2499 }
2500
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002501 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002502}
2503
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002504#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002505
/* Size of the char buffer used for the output of %lld or %p.
   A long long needs at most ceil(log10(256)*SIZEOF_LONG_LONG) digits, and
   53/22 is an upper bound for log10(256).  Two more characters are added on
   top of the digit estimate (for example, an 8-byte long long gives 21). */
#define MAX_LONG_LONG_CHARS (((SIZEOF_LONG_LONG * 53 - 1) / 22) + 2)
Victor Stinner96865452011-03-01 23:44:09 +00002510
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002511static int
2512unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2513 Py_ssize_t width, Py_ssize_t precision)
2514{
2515 Py_ssize_t length, fill, arglen;
2516 Py_UCS4 maxchar;
2517
2518 if (PyUnicode_READY(str) == -1)
2519 return -1;
2520
2521 length = PyUnicode_GET_LENGTH(str);
2522 if ((precision == -1 || precision >= length)
2523 && width <= length)
2524 return _PyUnicodeWriter_WriteStr(writer, str);
2525
2526 if (precision != -1)
2527 length = Py_MIN(precision, length);
2528
2529 arglen = Py_MAX(length, width);
2530 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2531 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2532 else
2533 maxchar = writer->maxchar;
2534
2535 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2536 return -1;
2537
2538 if (width > length) {
2539 fill = width - length;
2540 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2541 return -1;
2542 writer->pos += fill;
2543 }
2544
2545 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2546 str, 0, length);
2547 writer->pos += length;
2548 return 0;
2549}
2550
2551static int
2552unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2553 Py_ssize_t width, Py_ssize_t precision)
2554{
2555 /* UTF-8 */
2556 Py_ssize_t length;
2557 PyObject *unicode;
2558 int res;
2559
2560 length = strlen(str);
2561 if (precision != -1)
2562 length = Py_MIN(length, precision);
2563 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2564 if (unicode == NULL)
2565 return -1;
2566
2567 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2568 Py_DECREF(unicode);
2569 return res;
2570}
2571
Victor Stinner96865452011-03-01 23:44:09 +00002572static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002573unicode_fromformat_arg(_PyUnicodeWriter *writer,
2574 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002575{
Victor Stinnere215d962012-10-06 23:03:36 +02002576 const char *p;
2577 Py_ssize_t len;
2578 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002579 Py_ssize_t width;
2580 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002581 int longflag;
2582 int longlongflag;
2583 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002584 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002585
2586 p = f;
2587 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002588 zeropad = 0;
2589 if (*f == '0') {
2590 zeropad = 1;
2591 f++;
2592 }
Victor Stinner96865452011-03-01 23:44:09 +00002593
2594 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002595 width = -1;
2596 if (Py_ISDIGIT((unsigned)*f)) {
2597 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002598 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002599 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002600 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002601 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002602 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002603 return NULL;
2604 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002605 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002606 f++;
2607 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002608 }
2609 precision = -1;
2610 if (*f == '.') {
2611 f++;
2612 if (Py_ISDIGIT((unsigned)*f)) {
2613 precision = (*f - '0');
2614 f++;
2615 while (Py_ISDIGIT((unsigned)*f)) {
2616 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2617 PyErr_SetString(PyExc_ValueError,
2618 "precision too big");
2619 return NULL;
2620 }
2621 precision = (precision * 10) + (*f - '0');
2622 f++;
2623 }
2624 }
Victor Stinner96865452011-03-01 23:44:09 +00002625 if (*f == '%') {
2626 /* "%.3%s" => f points to "3" */
2627 f--;
2628 }
2629 }
2630 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002631 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002632 f--;
2633 }
Victor Stinner96865452011-03-01 23:44:09 +00002634
2635 /* Handle %ld, %lu, %lld and %llu. */
2636 longflag = 0;
2637 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002638 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002639 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002640 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002641 longflag = 1;
2642 ++f;
2643 }
Victor Stinner96865452011-03-01 23:44:09 +00002644 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002645 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002646 longlongflag = 1;
2647 f += 2;
2648 }
Victor Stinner96865452011-03-01 23:44:09 +00002649 }
2650 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002651 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002652 size_tflag = 1;
2653 ++f;
2654 }
Victor Stinnere215d962012-10-06 23:03:36 +02002655
2656 if (f[1] == '\0')
2657 writer->overallocate = 0;
2658
2659 switch (*f) {
2660 case 'c':
2661 {
2662 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002663 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002664 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002665 "character argument not in range(0x110000)");
2666 return NULL;
2667 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002668 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002669 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002670 break;
2671 }
2672
2673 case 'i':
2674 case 'd':
2675 case 'u':
2676 case 'x':
2677 {
2678 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002679 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002680 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002681
2682 if (*f == 'u') {
Victor Stinnere215d962012-10-06 23:03:36 +02002683 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002684 len = sprintf(buffer, "%lu",
Victor Stinnere215d962012-10-06 23:03:36 +02002685 va_arg(*vargs, unsigned long));
Victor Stinnere215d962012-10-06 23:03:36 +02002686 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002687 len = sprintf(buffer, "%llu",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002688 va_arg(*vargs, unsigned long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002689 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002690 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002691 va_arg(*vargs, size_t));
2692 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002693 len = sprintf(buffer, "%u",
Victor Stinnere215d962012-10-06 23:03:36 +02002694 va_arg(*vargs, unsigned int));
2695 }
2696 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002697 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002698 }
2699 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002700 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002701 len = sprintf(buffer, "%li",
Victor Stinnere215d962012-10-06 23:03:36 +02002702 va_arg(*vargs, long));
Victor Stinnere215d962012-10-06 23:03:36 +02002703 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002704 len = sprintf(buffer, "%lli",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002705 va_arg(*vargs, long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002706 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002707 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002708 va_arg(*vargs, Py_ssize_t));
2709 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002710 len = sprintf(buffer, "%i",
Victor Stinnere215d962012-10-06 23:03:36 +02002711 va_arg(*vargs, int));
2712 }
2713 assert(len >= 0);
2714
Victor Stinnere215d962012-10-06 23:03:36 +02002715 if (precision < len)
2716 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002717
2718 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002719 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2720 return NULL;
2721
Victor Stinnere215d962012-10-06 23:03:36 +02002722 if (width > precision) {
2723 Py_UCS4 fillchar;
2724 fill = width - precision;
2725 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002726 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2727 return NULL;
2728 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002729 }
Victor Stinner15a11362012-10-06 23:48:20 +02002730 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002731 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002732 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2733 return NULL;
2734 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002735 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002736
Victor Stinner4a587072013-11-19 12:54:53 +01002737 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2738 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002739 break;
2740 }
2741
2742 case 'p':
2743 {
2744 char number[MAX_LONG_LONG_CHARS];
2745
2746 len = sprintf(number, "%p", va_arg(*vargs, void*));
2747 assert(len >= 0);
2748
2749 /* %p is ill-defined: ensure leading 0x. */
2750 if (number[1] == 'X')
2751 number[1] = 'x';
2752 else if (number[1] != 'x') {
2753 memmove(number + 2, number,
2754 strlen(number) + 1);
2755 number[0] = '0';
2756 number[1] = 'x';
2757 len += 2;
2758 }
2759
Victor Stinner4a587072013-11-19 12:54:53 +01002760 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002761 return NULL;
2762 break;
2763 }
2764
2765 case 's':
2766 {
2767 /* UTF-8 */
2768 const char *s = va_arg(*vargs, const char*);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002769 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002770 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002771 break;
2772 }
2773
2774 case 'U':
2775 {
2776 PyObject *obj = va_arg(*vargs, PyObject *);
2777 assert(obj && _PyUnicode_CHECK(obj));
2778
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002779 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002780 return NULL;
2781 break;
2782 }
2783
2784 case 'V':
2785 {
2786 PyObject *obj = va_arg(*vargs, PyObject *);
2787 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002788 if (obj) {
2789 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002790 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002791 return NULL;
2792 }
2793 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002794 assert(str != NULL);
2795 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002796 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002797 }
2798 break;
2799 }
2800
2801 case 'S':
2802 {
2803 PyObject *obj = va_arg(*vargs, PyObject *);
2804 PyObject *str;
2805 assert(obj);
2806 str = PyObject_Str(obj);
2807 if (!str)
2808 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002809 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002810 Py_DECREF(str);
2811 return NULL;
2812 }
2813 Py_DECREF(str);
2814 break;
2815 }
2816
2817 case 'R':
2818 {
2819 PyObject *obj = va_arg(*vargs, PyObject *);
2820 PyObject *repr;
2821 assert(obj);
2822 repr = PyObject_Repr(obj);
2823 if (!repr)
2824 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002825 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002826 Py_DECREF(repr);
2827 return NULL;
2828 }
2829 Py_DECREF(repr);
2830 break;
2831 }
2832
2833 case 'A':
2834 {
2835 PyObject *obj = va_arg(*vargs, PyObject *);
2836 PyObject *ascii;
2837 assert(obj);
2838 ascii = PyObject_ASCII(obj);
2839 if (!ascii)
2840 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002841 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002842 Py_DECREF(ascii);
2843 return NULL;
2844 }
2845 Py_DECREF(ascii);
2846 break;
2847 }
2848
2849 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002850 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002851 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002852 break;
2853
2854 default:
2855 /* if we stumble upon an unknown formatting code, copy the rest
2856 of the format string to the output string. (we cannot just
2857 skip the code, since there's no way to know what's in the
2858 argument list) */
2859 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01002860 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002861 return NULL;
2862 f = p+len;
2863 return f;
2864 }
2865
2866 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002867 return f;
2868}
2869
Walter Dörwaldd2034312007-05-18 16:29:38 +00002870PyObject *
2871PyUnicode_FromFormatV(const char *format, va_list vargs)
2872{
Victor Stinnere215d962012-10-06 23:03:36 +02002873 va_list vargs2;
2874 const char *f;
2875 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002876
Victor Stinner8f674cc2013-04-17 23:02:17 +02002877 _PyUnicodeWriter_Init(&writer);
2878 writer.min_length = strlen(format) + 100;
2879 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02002880
Benjamin Peterson0c212142016-09-20 20:39:33 -07002881 // Copy varags to be able to pass a reference to a subfunction.
2882 va_copy(vargs2, vargs);
Victor Stinnere215d962012-10-06 23:03:36 +02002883
2884 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002885 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002886 f = unicode_fromformat_arg(&writer, f, &vargs2);
2887 if (f == NULL)
2888 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002889 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002890 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002891 const char *p;
2892 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002893
Victor Stinnere215d962012-10-06 23:03:36 +02002894 p = f;
2895 do
2896 {
2897 if ((unsigned char)*p > 127) {
Christian Heimes2f2fee12016-09-21 11:37:27 +02002898 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02002899 PyErr_Format(PyExc_ValueError,
2900 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2901 "string, got a non-ASCII byte: 0x%02x",
2902 (unsigned char)*p);
Victor Stinner1ddf53d2016-09-21 14:13:14 +02002903 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002904 }
2905 p++;
2906 }
2907 while (*p != '\0' && *p != '%');
2908 len = p - f;
2909
2910 if (*p == '\0')
2911 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01002912
2913 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002914 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002915
2916 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002917 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002918 }
Christian Heimes2f2fee12016-09-21 11:37:27 +02002919 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02002920 return _PyUnicodeWriter_Finish(&writer);
2921
2922 fail:
Christian Heimes2f2fee12016-09-21 11:37:27 +02002923 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02002924 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002925 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002926}
2927
Walter Dörwaldd2034312007-05-18 16:29:38 +00002928PyObject *
2929PyUnicode_FromFormat(const char *format, ...)
2930{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002931 PyObject* ret;
2932 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002933
2934#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002935 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002936#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002937 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002938#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002939 ret = PyUnicode_FromFormatV(format, vargs);
2940 va_end(vargs);
2941 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002942}
2943
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002944#ifdef HAVE_WCHAR_H
2945
Victor Stinner5593d8a2010-10-02 11:11:27 +00002946/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2947 convert a Unicode object to a wide character string.
2948
Victor Stinnerd88d9832011-09-06 02:00:05 +02002949 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002950 character) required to convert the unicode object. Ignore size argument.
2951
Victor Stinnerd88d9832011-09-06 02:00:05 +02002952 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002953 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002954 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002955static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002956unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002957 wchar_t *w,
2958 Py_ssize_t size)
2959{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002960 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002961 const wchar_t *wstr;
2962
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002963 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002964 if (wstr == NULL)
2965 return -1;
2966
Victor Stinner5593d8a2010-10-02 11:11:27 +00002967 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002968 if (size > res)
2969 size = res + 1;
2970 else
2971 res = size;
Christian Heimesf051e432016-09-13 20:22:02 +02002972 memcpy(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002973 return res;
2974 }
2975 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002976 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002977}
2978
2979Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002980PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002981 wchar_t *w,
2982 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002983{
2984 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002985 PyErr_BadInternalCall();
2986 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002987 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002988 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002989}
2990
Victor Stinner137c34c2010-09-29 10:25:54 +00002991wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002992PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002993 Py_ssize_t *size)
2994{
2995 wchar_t* buffer;
2996 Py_ssize_t buflen;
2997
2998 if (unicode == NULL) {
2999 PyErr_BadInternalCall();
3000 return NULL;
3001 }
3002
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003003 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003004 if (buflen == -1)
3005 return NULL;
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02003006 buffer = PyMem_NEW(wchar_t, buflen);
Victor Stinner137c34c2010-09-29 10:25:54 +00003007 if (buffer == NULL) {
3008 PyErr_NoMemory();
3009 return NULL;
3010 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003011 buflen = unicode_aswidechar(unicode, buffer, buflen);
Stefan Krah8528c312012-08-19 21:52:43 +02003012 if (buflen == -1) {
3013 PyMem_FREE(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003014 return NULL;
Stefan Krah8528c312012-08-19 21:52:43 +02003015 }
Victor Stinner5593d8a2010-10-02 11:11:27 +00003016 if (size != NULL)
3017 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00003018 return buffer;
3019}
3020
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003021#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003022
Alexander Belopolsky40018472011-02-26 01:02:56 +00003023PyObject *
3024PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003025{
Victor Stinner8faf8212011-12-08 22:14:11 +01003026 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003027 PyErr_SetString(PyExc_ValueError,
3028 "chr() arg not in range(0x110000)");
3029 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003030 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00003031
Victor Stinner985a82a2014-01-03 12:53:47 +01003032 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003033}
3034
Alexander Belopolsky40018472011-02-26 01:02:56 +00003035PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003036PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003037{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003038 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00003039 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003040 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003041 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02003042 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00003043 Py_INCREF(obj);
3044 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003045 }
3046 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003047 /* For a Unicode subtype that's not a Unicode object,
3048 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003049 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003050 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003051 PyErr_Format(PyExc_TypeError,
3052 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00003053 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003054 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003055}
3056
Alexander Belopolsky40018472011-02-26 01:02:56 +00003057PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003058PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003059 const char *encoding,
3060 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003061{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003062 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003063 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003064
Guido van Rossumd57fd912000-03-10 22:53:23 +00003065 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003066 PyErr_BadInternalCall();
3067 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003068 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003069
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003070 /* Decoding bytes objects is the most common case and should be fast */
3071 if (PyBytes_Check(obj)) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003072 if (PyBytes_GET_SIZE(obj) == 0)
3073 _Py_RETURN_UNICODE_EMPTY();
3074 v = PyUnicode_Decode(
3075 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3076 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003077 return v;
3078 }
3079
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003080 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003081 PyErr_SetString(PyExc_TypeError,
3082 "decoding str is not supported");
3083 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003084 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003085
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003086 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3087 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3088 PyErr_Format(PyExc_TypeError,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003089 "decoding to str: need a bytes-like object, %.80s found",
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003090 Py_TYPE(obj)->tp_name);
3091 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003092 }
Tim Petersced69f82003-09-16 20:30:58 +00003093
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003094 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003095 PyBuffer_Release(&buffer);
3096 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003097 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003098
Serhiy Storchaka05997252013-01-26 12:14:02 +02003099 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003100 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003101 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003102}
3103
/* Normalize an encoding name: C implementation of
   encodings.normalize_encoding().

   Alphanumeric characters and '.' are copied to `lower` in lower case.
   Any run of other characters is replaced by a single '_', except that
   nothing is emitted for a run at the very start or the very end of the
   name.  The output is always null-terminated on success.

   `lower_len` is the total size of `lower`, terminator included.

   Return 1 on success, or 0 on error (the normalized name does not fit in
   lower_len-1 characters, which includes lower_len == 0). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;
    int punct;

    assert(encoding != NULL);

    /* Without this guard lower_len - 1 would wrap around and the bounds
       checks below would never trigger. */
    if (lower_len == 0) {
        return 0;
    }

    l = lower;
    l_end = &lower[lower_len - 1];   /* slot reserved for the terminator */
    punct = 0;
    for (e = encoding; *e != 0; e++) {
        char c = *e;

        if (!(Py_ISALNUM(c) || c == '.')) {
            /* Remember the separator; it is only written out if another
               name character follows. */
            punct = 1;
            continue;
        }

        if (punct && l != lower) {
            if (l == l_end) {
                return 0;
            }
            *l++ = '_';
        }
        punct = 0;

        if (l == l_end) {
            return 0;
        }
        *l++ = Py_TOLOWER(c);
    }
    *l = '\0';
    return 1;
}
3152
Alexander Belopolsky40018472011-02-26 01:02:56 +00003153PyObject *
3154PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003155 Py_ssize_t size,
3156 const char *encoding,
3157 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003158{
3159 PyObject *buffer = NULL, *unicode;
3160 Py_buffer info;
Victor Stinner942889a2016-09-05 15:40:10 -07003161 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3162
3163 if (encoding == NULL) {
3164 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3165 }
Victor Stinner600d3be2010-06-10 12:00:55 +00003166
Fred Drakee4315f52000-05-09 19:53:39 +00003167 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003168 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3169 char *lower = buflower;
3170
3171 /* Fast paths */
3172 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3173 lower += 3;
3174 if (*lower == '_') {
3175 /* Match "utf8" and "utf_8" */
3176 lower++;
3177 }
3178
3179 if (lower[0] == '8' && lower[1] == 0) {
3180 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3181 }
3182 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3183 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3184 }
3185 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3186 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3187 }
3188 }
3189 else {
3190 if (strcmp(lower, "ascii") == 0
3191 || strcmp(lower, "us_ascii") == 0) {
3192 return PyUnicode_DecodeASCII(s, size, errors);
3193 }
Steve Dowercc16be82016-09-08 10:35:16 -07003194 #ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003195 else if (strcmp(lower, "mbcs") == 0) {
3196 return PyUnicode_DecodeMBCS(s, size, errors);
3197 }
3198 #endif
3199 else if (strcmp(lower, "latin1") == 0
3200 || strcmp(lower, "latin_1") == 0
3201 || strcmp(lower, "iso_8859_1") == 0
3202 || strcmp(lower, "iso8859_1") == 0) {
3203 return PyUnicode_DecodeLatin1(s, size, errors);
3204 }
3205 }
Victor Stinner37296e82010-06-10 13:36:23 +00003206 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003207
3208 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003209 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003210 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003211 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003212 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003213 if (buffer == NULL)
3214 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003215 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003216 if (unicode == NULL)
3217 goto onError;
3218 if (!PyUnicode_Check(unicode)) {
3219 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003220 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3221 "use codecs.decode() to decode to arbitrary types",
3222 encoding,
3223 Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003224 Py_DECREF(unicode);
3225 goto onError;
3226 }
3227 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003228 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003229
Benjamin Peterson29060642009-01-31 22:14:21 +00003230 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003231 Py_XDECREF(buffer);
3232 return NULL;
3233}
3234
Alexander Belopolsky40018472011-02-26 01:02:56 +00003235PyObject *
3236PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003237 const char *encoding,
3238 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003239{
3240 PyObject *v;
3241
3242 if (!PyUnicode_Check(unicode)) {
3243 PyErr_BadArgument();
3244 goto onError;
3245 }
3246
3247 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003248 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003249
3250 /* Decode via the codec registry */
3251 v = PyCodec_Decode(unicode, encoding, errors);
3252 if (v == NULL)
3253 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003254 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003255
Benjamin Peterson29060642009-01-31 22:14:21 +00003256 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003257 return NULL;
3258}
3259
Alexander Belopolsky40018472011-02-26 01:02:56 +00003260PyObject *
3261PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003262 const char *encoding,
3263 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003264{
3265 PyObject *v;
3266
3267 if (!PyUnicode_Check(unicode)) {
3268 PyErr_BadArgument();
3269 goto onError;
3270 }
3271
3272 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003273 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003274
3275 /* Decode via the codec registry */
3276 v = PyCodec_Decode(unicode, encoding, errors);
3277 if (v == NULL)
3278 goto onError;
3279 if (!PyUnicode_Check(v)) {
3280 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003281 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3282 "use codecs.decode() to decode to arbitrary types",
3283 encoding,
3284 Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003285 Py_DECREF(v);
3286 goto onError;
3287 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003288 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003289
Benjamin Peterson29060642009-01-31 22:14:21 +00003290 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003291 return NULL;
3292}
3293
Alexander Belopolsky40018472011-02-26 01:02:56 +00003294PyObject *
3295PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003296 Py_ssize_t size,
3297 const char *encoding,
3298 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003299{
3300 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003301
Guido van Rossumd57fd912000-03-10 22:53:23 +00003302 unicode = PyUnicode_FromUnicode(s, size);
3303 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003304 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003305 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3306 Py_DECREF(unicode);
3307 return v;
3308}
3309
Alexander Belopolsky40018472011-02-26 01:02:56 +00003310PyObject *
3311PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003312 const char *encoding,
3313 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003314{
3315 PyObject *v;
3316
3317 if (!PyUnicode_Check(unicode)) {
3318 PyErr_BadArgument();
3319 goto onError;
3320 }
3321
3322 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003323 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003324
3325 /* Encode via the codec registry */
3326 v = PyCodec_Encode(unicode, encoding, errors);
3327 if (v == NULL)
3328 goto onError;
3329 return v;
3330
Benjamin Peterson29060642009-01-31 22:14:21 +00003331 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003332 return NULL;
3333}
3334
/* Locate the first wide character of `wstr` that wcstombs() cannot convert.

   The string is fed to wcstombs() one character at a time (one surrogate
   pair at a time when wchar_t is 16 bits wide).  Returns the index, in
   wchar_t units, of the first piece that fails to convert, or 0 if every
   piece converts. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
#if SIZEOF_WCHAR_T == 2
    wchar_t piece[3];           /* surrogate pair + terminator */
#else
    wchar_t piece[2];           /* one character + terminator */
#endif
    char scratch[MB_LEN_MAX];
    const wchar_t *cursor = wstr;
    const wchar_t *mark;
    size_t converted;

#if SIZEOF_WCHAR_T == 2
    piece[2] = 0;
#else
    piece[1] = 0;
#endif
    while (*cursor != L'\0') {
        /* Start of the piece about to be tried. */
        mark = cursor;
#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cursor[1]))
        {
            piece[0] = cursor[0];
            piece[1] = cursor[1];
            cursor += 2;
        }
        else {
            piece[0] = *cursor;
            piece[1] = 0;
            cursor++;
        }
#else
        piece[0] = *cursor;
        cursor++;
#endif
        converted = wcstombs(scratch, piece, sizeof(scratch));
        if (converted == (size_t)-1) {
            return mark - wstr;
        }
    }

    /* failed to find the unencodable character */
    return 0;
}
3381
Victor Stinner1b579672011-12-17 05:47:23 +01003382static int
3383locale_error_handler(const char *errors, int *surrogateescape)
3384{
Victor Stinner50149202015-09-22 00:26:54 +02003385 _Py_error_handler error_handler = get_error_handler(errors);
3386 switch (error_handler)
3387 {
3388 case _Py_ERROR_STRICT:
Victor Stinner1b579672011-12-17 05:47:23 +01003389 *surrogateescape = 0;
3390 return 0;
Victor Stinner50149202015-09-22 00:26:54 +02003391 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner1b579672011-12-17 05:47:23 +01003392 *surrogateescape = 1;
3393 return 0;
Victor Stinner50149202015-09-22 00:26:54 +02003394 default:
3395 PyErr_Format(PyExc_ValueError,
3396 "only 'strict' and 'surrogateescape' error handlers "
3397 "are supported, not '%s'",
3398 errors);
3399 return -1;
Victor Stinner1b579672011-12-17 05:47:23 +01003400 }
Victor Stinner1b579672011-12-17 05:47:23 +01003401}
3402
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003403PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003404PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003405{
3406 Py_ssize_t wlen, wlen2;
3407 wchar_t *wstr;
3408 PyObject *bytes = NULL;
3409 char *errmsg;
Raymond Hettingere56666d2013-08-04 11:51:03 -07003410 PyObject *reason = NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003411 PyObject *exc;
3412 size_t error_pos;
Victor Stinner1b579672011-12-17 05:47:23 +01003413 int surrogateescape;
3414
3415 if (locale_error_handler(errors, &surrogateescape) < 0)
3416 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003417
3418 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3419 if (wstr == NULL)
3420 return NULL;
3421
3422 wlen2 = wcslen(wstr);
3423 if (wlen2 != wlen) {
3424 PyMem_Free(wstr);
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003425 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003426 return NULL;
3427 }
3428
3429 if (surrogateescape) {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003430 /* "surrogateescape" error handler */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003431 char *str;
3432
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003433 str = Py_EncodeLocale(wstr, &error_pos);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003434 if (str == NULL) {
3435 if (error_pos == (size_t)-1) {
3436 PyErr_NoMemory();
3437 PyMem_Free(wstr);
3438 return NULL;
3439 }
3440 else {
3441 goto encode_error;
3442 }
3443 }
3444 PyMem_Free(wstr);
3445
3446 bytes = PyBytes_FromString(str);
3447 PyMem_Free(str);
3448 }
3449 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003450 /* strict mode */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003451 size_t len, len2;
3452
3453 len = wcstombs(NULL, wstr, 0);
3454 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003455 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003456 goto encode_error;
3457 }
3458
3459 bytes = PyBytes_FromStringAndSize(NULL, len);
3460 if (bytes == NULL) {
3461 PyMem_Free(wstr);
3462 return NULL;
3463 }
3464
3465 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3466 if (len2 == (size_t)-1 || len2 > len) {
Victor Stinner2f197072011-12-17 07:08:30 +01003467 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003468 goto encode_error;
3469 }
3470 PyMem_Free(wstr);
3471 }
3472 return bytes;
3473
3474encode_error:
3475 errmsg = strerror(errno);
3476 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003477
3478 if (error_pos == (size_t)-1)
3479 error_pos = wcstombs_errorpos(wstr);
3480
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003481 PyMem_Free(wstr);
3482 Py_XDECREF(bytes);
3483
Victor Stinner2f197072011-12-17 07:08:30 +01003484 if (errmsg != NULL) {
3485 size_t errlen;
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003486 wstr = Py_DecodeLocale(errmsg, &errlen);
Victor Stinner2f197072011-12-17 07:08:30 +01003487 if (wstr != NULL) {
3488 reason = PyUnicode_FromWideChar(wstr, errlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003489 PyMem_RawFree(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003490 } else
3491 errmsg = NULL;
3492 }
3493 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003494 reason = PyUnicode_FromString(
3495 "wcstombs() encountered an unencodable "
3496 "wide character");
3497 if (reason == NULL)
3498 return NULL;
3499
3500 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3501 "locale", unicode,
3502 (Py_ssize_t)error_pos,
3503 (Py_ssize_t)(error_pos+1),
3504 reason);
3505 Py_DECREF(reason);
3506 if (exc != NULL) {
3507 PyCodec_StrictErrors(exc);
3508 Py_XDECREF(exc);
3509 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003510 return NULL;
3511}
3512
Victor Stinnerad158722010-10-27 00:25:46 +00003513PyObject *
3514PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003515{
Steve Dowercc16be82016-09-08 10:35:16 -07003516#if defined(__APPLE__)
3517 return _PyUnicode_AsUTF8String(unicode, Py_FileSystemDefaultEncodeErrors);
Victor Stinnerad158722010-10-27 00:25:46 +00003518#else
Victor Stinner793b5312011-04-27 00:24:21 +02003519 PyInterpreterState *interp = PyThreadState_GET()->interp;
3520 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3521 cannot use it to encode and decode filenames before it is loaded. Load
3522 the Python codec requires to encode at least its own filename. Use the C
3523 version of the locale codec until the codec registry is initialized and
3524 the Python codec is loaded.
3525
3526 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3527 cannot only rely on it: check also interp->fscodec_initialized for
3528 subinterpreters. */
3529 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003530 return PyUnicode_AsEncodedString(unicode,
3531 Py_FileSystemDefaultEncoding,
Steve Dowercc16be82016-09-08 10:35:16 -07003532 Py_FileSystemDefaultEncodeErrors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003533 }
3534 else {
Steve Dowercc16be82016-09-08 10:35:16 -07003535 return PyUnicode_EncodeLocale(unicode, Py_FileSystemDefaultEncodeErrors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003536 }
Victor Stinnerad158722010-10-27 00:25:46 +00003537#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003538}
3539
Alexander Belopolsky40018472011-02-26 01:02:56 +00003540PyObject *
3541PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003542 const char *encoding,
3543 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003544{
3545 PyObject *v;
Victor Stinner942889a2016-09-05 15:40:10 -07003546 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003547
Guido van Rossumd57fd912000-03-10 22:53:23 +00003548 if (!PyUnicode_Check(unicode)) {
3549 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003550 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003551 }
Fred Drakee4315f52000-05-09 19:53:39 +00003552
Victor Stinner942889a2016-09-05 15:40:10 -07003553 if (encoding == NULL) {
3554 return _PyUnicode_AsUTF8String(unicode, errors);
3555 }
3556
Fred Drakee4315f52000-05-09 19:53:39 +00003557 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003558 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3559 char *lower = buflower;
3560
3561 /* Fast paths */
3562 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3563 lower += 3;
3564 if (*lower == '_') {
3565 /* Match "utf8" and "utf_8" */
3566 lower++;
3567 }
3568
3569 if (lower[0] == '8' && lower[1] == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003570 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner942889a2016-09-05 15:40:10 -07003571 }
3572 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3573 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3574 }
3575 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3576 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3577 }
Victor Stinnera5c68c32011-03-02 01:03:14 +00003578 }
Victor Stinner942889a2016-09-05 15:40:10 -07003579 else {
3580 if (strcmp(lower, "ascii") == 0
3581 || strcmp(lower, "us_ascii") == 0) {
3582 return _PyUnicode_AsASCIIString(unicode, errors);
3583 }
Steve Dowercc16be82016-09-08 10:35:16 -07003584#ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003585 else if (strcmp(lower, "mbcs") == 0) {
3586 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3587 }
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003588#endif
Victor Stinner942889a2016-09-05 15:40:10 -07003589 else if (strcmp(lower, "latin1") == 0 ||
3590 strcmp(lower, "latin_1") == 0 ||
3591 strcmp(lower, "iso_8859_1") == 0 ||
3592 strcmp(lower, "iso8859_1") == 0) {
3593 return _PyUnicode_AsLatin1String(unicode, errors);
3594 }
3595 }
Victor Stinner37296e82010-06-10 13:36:23 +00003596 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003597
3598 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003599 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003600 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003601 return NULL;
3602
3603 /* The normal path */
3604 if (PyBytes_Check(v))
3605 return v;
3606
3607 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003608 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003609 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003610 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003611
3612 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003613 "encoder %s returned bytearray instead of bytes; "
3614 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003615 encoding);
3616 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003617 Py_DECREF(v);
3618 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003619 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003620
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003621 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3622 Py_DECREF(v);
3623 return b;
3624 }
3625
3626 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003627 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
3628 "use codecs.encode() to encode to arbitrary types",
3629 encoding,
3630 Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003631 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003632 return NULL;
3633}
3634
Alexander Belopolsky40018472011-02-26 01:02:56 +00003635PyObject *
3636PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003637 const char *encoding,
3638 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003639{
3640 PyObject *v;
3641
3642 if (!PyUnicode_Check(unicode)) {
3643 PyErr_BadArgument();
3644 goto onError;
3645 }
3646
3647 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003648 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003649
3650 /* Encode via the codec registry */
3651 v = PyCodec_Encode(unicode, encoding, errors);
3652 if (v == NULL)
3653 goto onError;
3654 if (!PyUnicode_Check(v)) {
3655 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003656 "'%.400s' encoder returned '%.400s' instead of 'str'; "
3657 "use codecs.encode() to encode to arbitrary types",
3658 encoding,
3659 Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003660 Py_DECREF(v);
3661 goto onError;
3662 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003663 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003664
Benjamin Peterson29060642009-01-31 22:14:21 +00003665 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003666 return NULL;
3667}
3668
/* Locate the first byte sequence in str[0..len) that mbrtowc() refuses to
   convert.

   Returns the byte offset of that sequence.  Returns 0 when no failing
   sequence is found, and always returns 0 when HAVE_MBRTOWC is not
   defined. */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *cursor = str;
    size_t remaining = len;
    size_t consumed;
    mbstate_t state;
    wchar_t wc;

    memset(&state, 0, sizeof state);
    for (; remaining != 0; cursor += consumed, remaining -= consumed) {
        consumed = mbrtowc(&wc, cursor, remaining, &state);
        if (consumed == 0) {
            /* Reached end of string */
            break;
        }
        if (consumed == (size_t)-1 || consumed == (size_t)-2) {
            /* Conversion error or incomplete character */
            return cursor - str;
        }
    }
    /* failed to find the undecodable byte sequence */
    return 0;
#else
    return 0;
#endif
}
3699
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003700PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003701PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003702 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003703{
3704 wchar_t smallbuf[256];
3705 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3706 wchar_t *wstr;
3707 size_t wlen, wlen2;
3708 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003709 int surrogateescape;
Victor Stinner2f197072011-12-17 07:08:30 +01003710 size_t error_pos;
3711 char *errmsg;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01003712 PyObject *reason = NULL; /* initialize to prevent gcc warning */
3713 PyObject *exc;
Victor Stinner1b579672011-12-17 05:47:23 +01003714
3715 if (locale_error_handler(errors, &surrogateescape) < 0)
3716 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003717
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003718 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3719 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003720 return NULL;
3721 }
3722
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003723 if (surrogateescape) {
3724 /* "surrogateescape" error handler */
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003725 wstr = Py_DecodeLocale(str, &wlen);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003726 if (wstr == NULL) {
3727 if (wlen == (size_t)-1)
3728 PyErr_NoMemory();
3729 else
3730 PyErr_SetFromErrno(PyExc_OSError);
3731 return NULL;
3732 }
3733
3734 unicode = PyUnicode_FromWideChar(wstr, wlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003735 PyMem_RawFree(wstr);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003736 }
3737 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003738 /* strict mode */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003739#ifndef HAVE_BROKEN_MBSTOWCS
3740 wlen = mbstowcs(NULL, str, 0);
3741#else
3742 wlen = len;
3743#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003744 if (wlen == (size_t)-1)
3745 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003746 if (wlen+1 <= smallbuf_len) {
3747 wstr = smallbuf;
3748 }
3749 else {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02003750 wstr = PyMem_New(wchar_t, wlen+1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003751 if (!wstr)
3752 return PyErr_NoMemory();
3753 }
3754
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003755 wlen2 = mbstowcs(wstr, str, wlen+1);
3756 if (wlen2 == (size_t)-1) {
3757 if (wstr != smallbuf)
3758 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003759 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003760 }
3761#ifdef HAVE_BROKEN_MBSTOWCS
3762 assert(wlen2 == wlen);
3763#endif
3764 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3765 if (wstr != smallbuf)
3766 PyMem_Free(wstr);
3767 }
3768 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003769
3770decode_error:
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +02003771 reason = NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01003772 errmsg = strerror(errno);
3773 assert(errmsg != NULL);
3774
3775 error_pos = mbstowcs_errorpos(str, len);
3776 if (errmsg != NULL) {
3777 size_t errlen;
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003778 wstr = Py_DecodeLocale(errmsg, &errlen);
Victor Stinner2f197072011-12-17 07:08:30 +01003779 if (wstr != NULL) {
3780 reason = PyUnicode_FromWideChar(wstr, errlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003781 PyMem_RawFree(wstr);
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +02003782 }
Victor Stinner2f197072011-12-17 07:08:30 +01003783 }
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +02003784 if (reason == NULL)
Victor Stinner2f197072011-12-17 07:08:30 +01003785 reason = PyUnicode_FromString(
3786 "mbstowcs() encountered an invalid multibyte sequence");
3787 if (reason == NULL)
3788 return NULL;
3789
3790 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3791 "locale", str, len,
3792 (Py_ssize_t)error_pos,
3793 (Py_ssize_t)(error_pos+1),
3794 reason);
3795 Py_DECREF(reason);
3796 if (exc != NULL) {
3797 PyCodec_StrictErrors(exc);
3798 Py_XDECREF(exc);
3799 }
3800 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003801}
3802
3803PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003804PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003805{
3806 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003807 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003808}
3809
3810
3811PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003812PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003813 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003814 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3815}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003816
Christian Heimes5894ba72007-11-04 11:43:14 +00003817PyObject*
3818PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3819{
Steve Dowercc16be82016-09-08 10:35:16 -07003820#if defined(__APPLE__)
3821 return PyUnicode_DecodeUTF8Stateful(s, size, Py_FileSystemDefaultEncodeErrors, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003822#else
Victor Stinner793b5312011-04-27 00:24:21 +02003823 PyInterpreterState *interp = PyThreadState_GET()->interp;
3824 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3825 cannot use it to encode and decode filenames before it is loaded. Load
3826 the Python codec requires to encode at least its own filename. Use the C
3827 version of the locale codec until the codec registry is initialized and
3828 the Python codec is loaded.
3829
3830 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3831 cannot only rely on it: check also interp->fscodec_initialized for
3832 subinterpreters. */
3833 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Steve Dowercc16be82016-09-08 10:35:16 -07003834 PyObject *res = PyUnicode_Decode(s, size,
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003835 Py_FileSystemDefaultEncoding,
Steve Dowercc16be82016-09-08 10:35:16 -07003836 Py_FileSystemDefaultEncodeErrors);
3837#ifdef MS_WINDOWS
3838 if (!res && PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
3839 PyObject *exc, *val, *tb;
3840 PyErr_Fetch(&exc, &val, &tb);
3841 PyErr_Format(PyExc_RuntimeError,
3842 "filesystem path bytes were not correctly encoded with '%s'. " \
3843 "Please report this at http://bugs.python.org/issue27781",
3844 Py_FileSystemDefaultEncoding);
3845 _PyErr_ChainExceptions(exc, val, tb);
3846 }
3847#endif
3848 return res;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003849 }
3850 else {
Steve Dowercc16be82016-09-08 10:35:16 -07003851 return PyUnicode_DecodeLocaleAndSize(s, size, Py_FileSystemDefaultEncodeErrors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003852 }
Victor Stinnerad158722010-10-27 00:25:46 +00003853#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003854}
3855
Martin v. Löwis011e8422009-05-05 04:43:17 +00003856
3857int
3858PyUnicode_FSConverter(PyObject* arg, void* addr)
3859{
Brett Cannonec6ce872016-09-06 15:50:29 -07003860 PyObject *path = NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003861 PyObject *output = NULL;
3862 Py_ssize_t size;
3863 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003864 if (arg == NULL) {
3865 Py_DECREF(*(PyObject**)addr);
Benjamin Petersona4d33b32015-11-15 21:57:39 -08003866 *(PyObject**)addr = NULL;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003867 return 1;
3868 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003869 path = PyOS_FSPath(arg);
3870 if (path == NULL) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003871 return 0;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003872 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003873 if (PyBytes_Check(path)) {
3874 output = path;
3875 }
3876 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
3877 output = PyUnicode_EncodeFSDefault(path);
3878 Py_DECREF(path);
3879 if (!output) {
3880 return 0;
3881 }
3882 assert(PyBytes_Check(output));
3883 }
3884
Victor Stinner0ea2a462010-04-30 00:22:08 +00003885 size = PyBytes_GET_SIZE(output);
3886 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02003887 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003888 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003889 Py_DECREF(output);
3890 return 0;
3891 }
3892 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003893 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003894}
3895
3896
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003897int
3898PyUnicode_FSDecoder(PyObject* arg, void* addr)
3899{
Brett Cannona5711202016-09-06 19:36:01 -07003900 int is_buffer = 0;
3901 PyObject *path = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003902 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003903 if (arg == NULL) {
3904 Py_DECREF(*(PyObject**)addr);
3905 return 1;
3906 }
Brett Cannona5711202016-09-06 19:36:01 -07003907
3908 is_buffer = PyObject_CheckBuffer(arg);
3909 if (!is_buffer) {
3910 path = PyOS_FSPath(arg);
3911 if (path == NULL) {
Serhiy Storchakafebc3322016-08-06 23:29:29 +03003912 return 0;
3913 }
Brett Cannona5711202016-09-06 19:36:01 -07003914 }
3915 else {
3916 path = arg;
3917 Py_INCREF(arg);
3918 }
3919
3920 if (PyUnicode_Check(path)) {
3921 if (PyUnicode_READY(path) == -1) {
3922 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003923 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07003924 }
3925 output = path;
3926 }
3927 else if (PyBytes_Check(path) || is_buffer) {
3928 PyObject *path_bytes = NULL;
3929
3930 if (!PyBytes_Check(path) &&
3931 PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
3932 "path should be string, bytes, or os.PathLike, not %.200s",
3933 Py_TYPE(arg)->tp_name)) {
3934 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003935 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07003936 }
3937 path_bytes = PyBytes_FromObject(path);
3938 Py_DECREF(path);
3939 if (!path_bytes) {
3940 return 0;
3941 }
3942 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
3943 PyBytes_GET_SIZE(path_bytes));
3944 Py_DECREF(path_bytes);
3945 if (!output) {
3946 return 0;
3947 }
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003948 }
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003949 else {
3950 PyErr_Format(PyExc_TypeError,
Brett Cannona5711202016-09-06 19:36:01 -07003951 "path should be string, bytes, or os.PathLike, not %.200s",
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003952 Py_TYPE(arg)->tp_name);
Brett Cannona5711202016-09-06 19:36:01 -07003953 Py_DECREF(path);
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003954 return 0;
3955 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003956 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003957 Py_DECREF(output);
3958 return 0;
3959 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003960 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003961 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003962 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003963 Py_DECREF(output);
3964 return 0;
3965 }
3966 *(PyObject**)addr = output;
3967 return Py_CLEANUP_SUPPORTED;
3968}
3969
3970
Martin v. Löwis5b222132007-06-10 09:51:05 +00003971char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003972PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003973{
Christian Heimesf3863112007-11-22 07:46:41 +00003974 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003975
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003976 if (!PyUnicode_Check(unicode)) {
3977 PyErr_BadArgument();
3978 return NULL;
3979 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003980 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003981 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003982
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003983 if (PyUnicode_UTF8(unicode) == NULL) {
3984 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003985 bytes = _PyUnicode_AsUTF8String(unicode, NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003986 if (bytes == NULL)
3987 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003988 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3989 if (_PyUnicode_UTF8(unicode) == NULL) {
Victor Stinnera5afb582013-10-29 01:28:23 +01003990 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003991 Py_DECREF(bytes);
3992 return NULL;
3993 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003994 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
Christian Heimesf051e432016-09-13 20:22:02 +02003995 memcpy(_PyUnicode_UTF8(unicode),
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003996 PyBytes_AS_STRING(bytes),
3997 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003998 Py_DECREF(bytes);
3999 }
4000
4001 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004002 *psize = PyUnicode_UTF8_LENGTH(unicode);
4003 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004004}
4005
4006char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004007PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004008{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004009 return PyUnicode_AsUTF8AndSize(unicode, NULL);
4010}
4011
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004012Py_UNICODE *
4013PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
4014{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004015 const unsigned char *one_byte;
4016#if SIZEOF_WCHAR_T == 4
4017 const Py_UCS2 *two_bytes;
4018#else
4019 const Py_UCS4 *four_bytes;
4020 const Py_UCS4 *ucs4_end;
4021 Py_ssize_t num_surrogates;
4022#endif
4023 wchar_t *w;
4024 wchar_t *wchar_end;
4025
4026 if (!PyUnicode_Check(unicode)) {
4027 PyErr_BadArgument();
4028 return NULL;
4029 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004030 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004031 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004032 assert(_PyUnicode_KIND(unicode) != 0);
4033 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004034
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004035 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004036#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004037 four_bytes = PyUnicode_4BYTE_DATA(unicode);
4038 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004039 num_surrogates = 0;
4040
4041 for (; four_bytes < ucs4_end; ++four_bytes) {
4042 if (*four_bytes > 0xFFFF)
4043 ++num_surrogates;
4044 }
4045
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004046 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
4047 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
4048 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004049 PyErr_NoMemory();
4050 return NULL;
4051 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004052 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004053
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004054 w = _PyUnicode_WSTR(unicode);
4055 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
4056 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004057 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
4058 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004059 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004060 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01004061 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
4062 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004063 }
4064 else
4065 *w = *four_bytes;
4066
4067 if (w > wchar_end) {
4068 assert(0 && "Miscalculated string end");
4069 }
4070 }
4071 *w = 0;
4072#else
4073 /* sizeof(wchar_t) == 4 */
4074 Py_FatalError("Impossible unicode object state, wstr and str "
4075 "should share memory already.");
4076 return NULL;
4077#endif
4078 }
4079 else {
Serhiy Storchakae55181f2015-02-20 21:34:06 +02004080 if ((size_t)_PyUnicode_LENGTH(unicode) >
4081 PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
4082 PyErr_NoMemory();
4083 return NULL;
4084 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004085 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
4086 (_PyUnicode_LENGTH(unicode) + 1));
4087 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004088 PyErr_NoMemory();
4089 return NULL;
4090 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004091 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
4092 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
4093 w = _PyUnicode_WSTR(unicode);
4094 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004095
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004096 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
4097 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004098 for (; w < wchar_end; ++one_byte, ++w)
4099 *w = *one_byte;
4100 /* null-terminate the wstr */
4101 *w = 0;
4102 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004103 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004104#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004105 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004106 for (; w < wchar_end; ++two_bytes, ++w)
4107 *w = *two_bytes;
4108 /* null-terminate the wstr */
4109 *w = 0;
4110#else
4111 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004112 PyObject_FREE(_PyUnicode_WSTR(unicode));
4113 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004114 Py_FatalError("Impossible unicode object state, wstr "
4115 "and str should share memory already.");
4116 return NULL;
4117#endif
4118 }
4119 else {
4120 assert(0 && "This should never happen.");
4121 }
4122 }
4123 }
4124 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004125 *size = PyUnicode_WSTR_LENGTH(unicode);
4126 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00004127}
4128
Alexander Belopolsky40018472011-02-26 01:02:56 +00004129Py_UNICODE *
4130PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004131{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004132 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004133}
4134
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004135
Alexander Belopolsky40018472011-02-26 01:02:56 +00004136Py_ssize_t
4137PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004138{
4139 if (!PyUnicode_Check(unicode)) {
4140 PyErr_BadArgument();
4141 goto onError;
4142 }
4143 return PyUnicode_GET_SIZE(unicode);
4144
Benjamin Peterson29060642009-01-31 22:14:21 +00004145 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004146 return -1;
4147}
4148
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004149Py_ssize_t
4150PyUnicode_GetLength(PyObject *unicode)
4151{
Victor Stinner07621332012-06-16 04:53:46 +02004152 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004153 PyErr_BadArgument();
4154 return -1;
4155 }
Victor Stinner07621332012-06-16 04:53:46 +02004156 if (PyUnicode_READY(unicode) == -1)
4157 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004158 return PyUnicode_GET_LENGTH(unicode);
4159}
4160
4161Py_UCS4
4162PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4163{
Victor Stinner69ed0f42013-04-09 21:48:24 +02004164 void *data;
4165 int kind;
4166
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004167 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
4168 PyErr_BadArgument();
4169 return (Py_UCS4)-1;
4170 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004171 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004172 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004173 return (Py_UCS4)-1;
4174 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02004175 data = PyUnicode_DATA(unicode);
4176 kind = PyUnicode_KIND(unicode);
4177 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004178}
4179
4180int
4181PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4182{
4183 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004184 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004185 return -1;
4186 }
Victor Stinner488fa492011-12-12 00:01:39 +01004187 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004188 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004189 PyErr_SetString(PyExc_IndexError, "string index out of range");
4190 return -1;
4191 }
Victor Stinner488fa492011-12-12 00:01:39 +01004192 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004193 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004194 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4195 PyErr_SetString(PyExc_ValueError, "character out of range");
4196 return -1;
4197 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004198 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4199 index, ch);
4200 return 0;
4201}
4202
/* Return the name of the default encoding, which is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4208
Victor Stinner554f3f02010-06-16 23:33:54 +00004209/* create or adjust a UnicodeDecodeError */
4210static void
4211make_decode_exception(PyObject **exceptionObject,
4212 const char *encoding,
4213 const char *input, Py_ssize_t length,
4214 Py_ssize_t startpos, Py_ssize_t endpos,
4215 const char *reason)
4216{
4217 if (*exceptionObject == NULL) {
4218 *exceptionObject = PyUnicodeDecodeError_Create(
4219 encoding, input, length, startpos, endpos, reason);
4220 }
4221 else {
4222 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4223 goto onError;
4224 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4225 goto onError;
4226 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4227 goto onError;
4228 }
4229 return;
4230
4231onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004232 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004233}
4234
#ifdef MS_WINDOWS
/* Error handling callback helper (wchar_t output variant):
   build the exception, call the error callback and check its result;
   if no exception occurred, copy the replacement to the output
   and adjust various state variables.

   errors, errorHandler: name of the error handler and a cache slot for the
       looked-up callable (filled in on first use).
   encoding, reason: used to create/update the UnicodeDecodeError.
   input, inend: in/out; re-pointed at the bytes object held by the
       exception after the callback ran (the callback may have replaced it).
   startinpos, endinpos: error range; *endinpos is set to the resume position.
   exceptionObject: cache slot for the exception instance.
   inptr: out; set to the position where decoding should resume.
   output, outpos: wchar-kind unicode object being filled and the current
       write position; the object is resized when more room is needed.

   return 0 on success, -1 on error (with an exception set)
*/

static int
unicode_decode_call_errorhandler_wchar(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    PyObject **output, Py_ssize_t *outpos)
{
    /* The first four characters ("O!n;") are the PyArg_ParseTuple format;
       the remainder doubles as the error message. */
    static const char *argparse = "O!n;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t outsize;
    Py_ssize_t insize;
    Py_ssize_t requiredsize;
    Py_ssize_t newpos;
    PyObject *inputobj = NULL;
    wchar_t *repwstr;
    Py_ssize_t repwlen;

    assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
    outsize = _PyUnicode_WSTR_LENGTH(*output);

    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        PyErr_SetString(PyExc_TypeError, &argparse[4]);
        goto onError;
    }
    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    if (!PyBytes_Check(inputobj)) {
        /* Do not go on to treat a non-bytes object as bytes: report the
           error, drop our reference and bail out. */
        PyErr_SetString(PyExc_TypeError, "exception attribute object must be bytes");
        Py_DECREF(inputobj);
        goto onError;
    }
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    /* A negative position is relative to the end of the input. */
    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
    if (repwstr == NULL)
        goto onError;
    /* need more space? (at least enough for what we
       have+the replacement+the rest of the string (starting
       at the new input position), so we won't have to check space
       when there are no errors in the rest of the string) */
    requiredsize = *outpos;
    if (requiredsize > PY_SSIZE_T_MAX - repwlen)
        goto overflow;
    requiredsize += repwlen;
    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
        goto overflow;
    requiredsize += insize - newpos;
    if (requiredsize > outsize) {
        /* Grow geometrically when that is enough and cannot overflow. */
        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
            requiredsize = 2*outsize;
        if (unicode_resize(output, requiredsize) < 0)
            goto onError;
    }
    /* Copy exactly repwlen wide characters.  memcpy() is used rather than
       wcsncpy() because the latter stops at the first embedded NUL in the
       replacement and zero-fills the remainder. */
    memcpy(_PyUnicode_WSTR(*output) + *outpos, repwstr,
           (size_t)repwlen * sizeof(wchar_t));
    *outpos += repwlen;
    *endinpos = newpos;
    *inptr = *input + newpos;

    /* we made it! */
    Py_XDECREF(restuple);
    return 0;

  overflow:
    PyErr_SetString(PyExc_OverflowError,
                    "decoded result is too long for a Python string");

  onError:
    Py_XDECREF(restuple);
    return -1;
}
#endif   /* MS_WINDOWS */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004350
4351static int
4352unicode_decode_call_errorhandler_writer(
4353 const char *errors, PyObject **errorHandler,
4354 const char *encoding, const char *reason,
4355 const char **input, const char **inend, Py_ssize_t *startinpos,
4356 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4357 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4358{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02004359 static const char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004360
4361 PyObject *restuple = NULL;
4362 PyObject *repunicode = NULL;
4363 Py_ssize_t insize;
4364 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004365 Py_ssize_t replen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004366 PyObject *inputobj = NULL;
4367
4368 if (*errorHandler == NULL) {
4369 *errorHandler = PyCodec_LookupError(errors);
4370 if (*errorHandler == NULL)
4371 goto onError;
4372 }
4373
4374 make_decode_exception(exceptionObject,
4375 encoding,
4376 *input, *inend - *input,
4377 *startinpos, *endinpos,
4378 reason);
4379 if (*exceptionObject == NULL)
4380 goto onError;
4381
4382 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4383 if (restuple == NULL)
4384 goto onError;
4385 if (!PyTuple_Check(restuple)) {
4386 PyErr_SetString(PyExc_TypeError, &argparse[4]);
4387 goto onError;
4388 }
4389 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004390 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004391
4392 /* Copy back the bytes variables, which might have been modified by the
4393 callback */
4394 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4395 if (!inputobj)
4396 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00004397 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004398 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00004399 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004400 *input = PyBytes_AS_STRING(inputobj);
4401 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004402 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004403 /* we can DECREF safely, as the exception has another reference,
4404 so the object won't go away. */
4405 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004406
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004407 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004408 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004409 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004410 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004411 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004412 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004413
Victor Stinner8f674cc2013-04-17 23:02:17 +02004414 if (PyUnicode_READY(repunicode) < 0)
4415 goto onError;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004416 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004417 if (replen > 1) {
4418 writer->min_length += replen - 1;
Victor Stinner8f674cc2013-04-17 23:02:17 +02004419 writer->overallocate = 1;
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004420 if (_PyUnicodeWriter_Prepare(writer, writer->min_length,
4421 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4422 goto onError;
4423 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004424 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004425 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004426
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004427 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004428 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004429
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004430 /* we made it! */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004431 Py_XDECREF(restuple);
4432 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004433
Benjamin Peterson29060642009-01-31 22:14:21 +00004434 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004435 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004436 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004437}
4438
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004439/* --- UTF-7 Codec -------------------------------------------------------- */
4440
Antoine Pitrou244651a2009-05-04 18:56:13 +00004441/* See RFC2152 for details. We encode conservatively and decode liberally. */
4442
/* Simple macros describing the base-64 alphabet used by UTF-7.
   Each macro may evaluate its argument more than once. */

/* Non-zero if c belongs to the base-64 alphabet (A-Z, a-z, 0-9, '+', '/'). */

#define IS_BASE64(c) \
    ((c) == '+' || (c) == '/' || \
     ((c) >= '0' && (c) <= '9') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= 'A' && (c) <= 'Z'))

/* Base-64 value (0..63) of c, which is expected to be a base-64 character;
   any character not matched by the explicit tests maps to 63. */

#define FROM_BASE64(c) \
    ((c) == '+' ? 62 : \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 : \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 : \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : 63)

/* Base-64 character for the low six bits of n (higher bits are ignored). */

#define TO_BASE64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: non-zero if a byte met in a UTF-7 string outside a shift
 * sequence decodes as itself.  Decoding is permissive: every ASCII byte
 * qualifies except '+', which opens a base-64 section. */

#define DECODE_DIRECT(c) \
    ((c) != '+' && (c) <= 127)
4473
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code (0..127) and is read-only.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c is evaluated several times.  directO and directWS are parenthesized
 * so that compound expressions (e.g. "a || b") can be passed safely.
 * NUL and anything >= 128 are never encoded directly; the range test
 * also guards the table lookup. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004519
Alexander Belopolsky40018472011-02-26 01:02:56 +00004520PyObject *
4521PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004522 Py_ssize_t size,
4523 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004524{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004525 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4526}
4527
Antoine Pitrou244651a2009-05-04 18:56:13 +00004528/* The decoder. The only state we preserve is our read position,
4529 * i.e. how many characters we have consumed. So if we end in the
4530 * middle of a shift sequence we have to back off the read position
4531 * and the output to the beginning of the sequence, otherwise we lose
4532 * all the shift state (seen bits, number of bits seen, high
4533 * surrogate). */
4534
Alexander Belopolsky40018472011-02-26 01:02:56 +00004535PyObject *
4536PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004537 Py_ssize_t size,
4538 const char *errors,
4539 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004540{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004541 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004542 Py_ssize_t startinpos;
4543 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004544 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004545 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004546 const char *errmsg = "";
4547 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004548 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004549 unsigned int base64bits = 0;
4550 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004551 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004552 PyObject *errorHandler = NULL;
4553 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004554
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004555 if (size == 0) {
4556 if (consumed)
4557 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004558 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004559 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004560
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004561 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004562 _PyUnicodeWriter_Init(&writer);
4563 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004564
4565 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004566 e = s + size;
4567
4568 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004569 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004570 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004571 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004572
Antoine Pitrou244651a2009-05-04 18:56:13 +00004573 if (inShift) { /* in a base-64 section */
4574 if (IS_BASE64(ch)) { /* consume a base-64 character */
4575 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4576 base64bits += 6;
4577 s++;
4578 if (base64bits >= 16) {
4579 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004580 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004581 base64bits -= 16;
4582 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004583 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004584 if (surrogate) {
4585 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004586 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4587 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004588 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004589 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004590 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004591 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004592 }
4593 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004594 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004595 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004596 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004597 }
4598 }
Victor Stinner551ac952011-11-29 22:58:13 +01004599 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004600 /* first surrogate */
4601 surrogate = outCh;
4602 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004603 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004604 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004605 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004606 }
4607 }
4608 }
4609 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004610 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004611 if (base64bits > 0) { /* left-over bits */
4612 if (base64bits >= 6) {
4613 /* We've seen at least one base-64 character */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004614 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004615 errmsg = "partial character in shift sequence";
4616 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004617 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004618 else {
4619 /* Some bits remain; they should be zero */
4620 if (base64buffer != 0) {
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004621 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004622 errmsg = "non-zero padding bits in shift sequence";
4623 goto utf7Error;
4624 }
4625 }
4626 }
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004627 if (surrogate && DECODE_DIRECT(ch)) {
4628 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4629 goto onError;
4630 }
4631 surrogate = 0;
4632 if (ch == '-') {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004633 /* '-' is absorbed; other terminating
4634 characters are preserved */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004635 s++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004636 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004637 }
4638 }
4639 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004640 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004641 s++; /* consume '+' */
4642 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004643 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004644 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004645 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004646 }
4647 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004648 inShift = 1;
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004649 surrogate = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004650 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004651 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004652 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004653 }
4654 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004655 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004656 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004657 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004658 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004659 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004660 else {
4661 startinpos = s-starts;
4662 s++;
4663 errmsg = "unexpected special character";
4664 goto utf7Error;
4665 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004666 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004667utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004668 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004669 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004670 errors, &errorHandler,
4671 "utf7", errmsg,
4672 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004673 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004674 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004675 }
4676
Antoine Pitrou244651a2009-05-04 18:56:13 +00004677 /* end of string */
4678
4679 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4680 /* if we're in an inconsistent state, that's an error */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004681 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004682 if (surrogate ||
4683 (base64bits >= 6) ||
4684 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004685 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004686 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004687 errors, &errorHandler,
4688 "utf7", "unterminated shift sequence",
4689 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004690 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004691 goto onError;
4692 if (s < e)
4693 goto restart;
4694 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004695 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004696
4697 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004698 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004699 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004700 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004701 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004702 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004703 writer.kind, writer.data, shiftOutStart);
4704 Py_XDECREF(errorHandler);
4705 Py_XDECREF(exc);
4706 _PyUnicodeWriter_Dealloc(&writer);
4707 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004708 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004709 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004710 }
4711 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004712 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004713 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004714 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004715
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004716 Py_XDECREF(errorHandler);
4717 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004718 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004719
Benjamin Peterson29060642009-01-31 22:14:21 +00004720 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004721 Py_XDECREF(errorHandler);
4722 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004723 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004724 return NULL;
4725}
4726
4727
Alexander Belopolsky40018472011-02-26 01:02:56 +00004728PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004729_PyUnicode_EncodeUTF7(PyObject *str,
4730 int base64SetO,
4731 int base64WhiteSpace,
4732 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004733{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004734 int kind;
4735 void *data;
4736 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004737 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004738 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004739 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004740 unsigned int base64bits = 0;
4741 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004742 char * out;
4743 char * start;
4744
Benjamin Petersonbac79492012-01-14 13:34:47 -05004745 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004746 return NULL;
4747 kind = PyUnicode_KIND(str);
4748 data = PyUnicode_DATA(str);
4749 len = PyUnicode_GET_LENGTH(str);
4750
4751 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004752 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004753
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004754 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004755 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004756 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004757 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004758 if (v == NULL)
4759 return NULL;
4760
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004761 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004762 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004763 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004764
Antoine Pitrou244651a2009-05-04 18:56:13 +00004765 if (inShift) {
4766 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4767 /* shifting out */
4768 if (base64bits) { /* output remaining bits */
4769 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4770 base64buffer = 0;
4771 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004772 }
4773 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004774 /* Characters not in the BASE64 set implicitly unshift the sequence
4775 so no '-' is required, except if the character is itself a '-' */
4776 if (IS_BASE64(ch) || ch == '-') {
4777 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004778 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004779 *out++ = (char) ch;
4780 }
4781 else {
4782 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004783 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004784 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004785 else { /* not in a shift sequence */
4786 if (ch == '+') {
4787 *out++ = '+';
4788 *out++ = '-';
4789 }
4790 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4791 *out++ = (char) ch;
4792 }
4793 else {
4794 *out++ = '+';
4795 inShift = 1;
4796 goto encode_char;
4797 }
4798 }
4799 continue;
4800encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004801 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004802 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004803
Antoine Pitrou244651a2009-05-04 18:56:13 +00004804 /* code first surrogate */
4805 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004806 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004807 while (base64bits >= 6) {
4808 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4809 base64bits -= 6;
4810 }
4811 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004812 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004813 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004814 base64bits += 16;
4815 base64buffer = (base64buffer << 16) | ch;
4816 while (base64bits >= 6) {
4817 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4818 base64bits -= 6;
4819 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004820 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004821 if (base64bits)
4822 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4823 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004824 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004825 if (_PyBytes_Resize(&v, out - start) < 0)
4826 return NULL;
4827 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004828}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004829PyObject *
4830PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4831 Py_ssize_t size,
4832 int base64SetO,
4833 int base64WhiteSpace,
4834 const char *errors)
4835{
4836 PyObject *result;
4837 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4838 if (tmp == NULL)
4839 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004840 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004841 base64WhiteSpace, errors);
4842 Py_DECREF(tmp);
4843 return result;
4844}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004845
Antoine Pitrou244651a2009-05-04 18:56:13 +00004846#undef IS_BASE64
4847#undef FROM_BASE64
4848#undef TO_BASE64
4849#undef DECODE_DIRECT
4850#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004851
Guido van Rossumd57fd912000-03-10 22:53:23 +00004852/* --- UTF-8 Codec -------------------------------------------------------- */
4853
Alexander Belopolsky40018472011-02-26 01:02:56 +00004854PyObject *
4855PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004856 Py_ssize_t size,
4857 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004858{
Walter Dörwald69652032004-09-07 20:24:22 +00004859 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4860}
4861
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004862#include "stringlib/asciilib.h"
4863#include "stringlib/codecs.h"
4864#include "stringlib/undef.h"
4865
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004866#include "stringlib/ucs1lib.h"
4867#include "stringlib/codecs.h"
4868#include "stringlib/undef.h"
4869
4870#include "stringlib/ucs2lib.h"
4871#include "stringlib/codecs.h"
4872#include "stringlib/undef.h"
4873
4874#include "stringlib/ucs4lib.h"
4875#include "stringlib/codecs.h"
4876#include "stringlib/undef.h"
4877
Antoine Pitrouab868312009-01-10 15:40:25 +00004878/* Mask to quickly check whether a C 'long' contains a
4879 non-ASCII, UTF8-encoded char. */
4880#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004881# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004882#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004883# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004884#else
4885# error C 'long' size should be either 4 or 8!
4886#endif
4887
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004888static Py_ssize_t
4889ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004890{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004891 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004892 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004893
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004894 /*
4895 * Issue #17237: m68k is a bit different from most architectures in
4896 * that objects do not use "natural alignment" - for example, int and
4897 * long are only aligned at 2-byte boundaries. Therefore the assert()
4898 * won't work; also, tests have shown that skipping the "optimised
4899 * version" will even speed up m68k.
4900 */
4901#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004902#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004903 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4904 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004905 /* Fast path, see in STRINGLIB(utf8_decode) for
4906 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004907 /* Help allocation */
4908 const char *_p = p;
4909 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004910 while (_p < aligned_end) {
4911 unsigned long value = *(const unsigned long *) _p;
4912 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004913 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004914 *((unsigned long *)q) = value;
4915 _p += SIZEOF_LONG;
4916 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004917 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004918 p = _p;
4919 while (p < end) {
4920 if ((unsigned char)*p & 0x80)
4921 break;
4922 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004923 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004924 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004925 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004926#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004927#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004928 while (p < end) {
4929 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4930 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004931 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004932 /* Help allocation */
4933 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004934 while (_p < aligned_end) {
4935 unsigned long value = *(unsigned long *) _p;
4936 if (value & ASCII_CHAR_MASK)
4937 break;
4938 _p += SIZEOF_LONG;
4939 }
4940 p = _p;
4941 if (_p == end)
4942 break;
4943 }
4944 if ((unsigned char)*p & 0x80)
4945 break;
4946 ++p;
4947 }
4948 memcpy(dest, start, p - start);
4949 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004950}
Antoine Pitrouab868312009-01-10 15:40:25 +00004951
Victor Stinner785938e2011-12-11 20:09:03 +01004952PyObject *
4953PyUnicode_DecodeUTF8Stateful(const char *s,
4954 Py_ssize_t size,
4955 const char *errors,
4956 Py_ssize_t *consumed)
4957{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004958 _PyUnicodeWriter writer;
Victor Stinner785938e2011-12-11 20:09:03 +01004959 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004960 const char *end = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004961
4962 Py_ssize_t startinpos;
4963 Py_ssize_t endinpos;
4964 const char *errmsg = "";
Victor Stinner1d65d912015-10-05 13:43:50 +02004965 PyObject *error_handler_obj = NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004966 PyObject *exc = NULL;
Victor Stinner1d65d912015-10-05 13:43:50 +02004967 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner785938e2011-12-11 20:09:03 +01004968
4969 if (size == 0) {
4970 if (consumed)
4971 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004972 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004973 }
4974
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004975 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4976 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004977 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004978 *consumed = 1;
4979 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004980 }
4981
Victor Stinner8f674cc2013-04-17 23:02:17 +02004982 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02004983 writer.min_length = size;
4984 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004985 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004986
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004987 writer.pos = ascii_decode(s, end, writer.data);
4988 s += writer.pos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004989 while (s < end) {
4990 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004991 int kind = writer.kind;
Victor Stinner1d65d912015-10-05 13:43:50 +02004992
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004993 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004994 if (PyUnicode_IS_ASCII(writer.buffer))
4995 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004996 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004997 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004998 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004999 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005000 } else {
5001 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005002 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005003 }
5004
5005 switch (ch) {
5006 case 0:
5007 if (s == end || consumed)
5008 goto End;
5009 errmsg = "unexpected end of data";
5010 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005011 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005012 break;
5013 case 1:
5014 errmsg = "invalid start byte";
5015 startinpos = s - starts;
5016 endinpos = startinpos + 1;
5017 break;
5018 case 2:
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005019 case 3:
5020 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005021 errmsg = "invalid continuation byte";
5022 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005023 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005024 break;
5025 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005026 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005027 goto onError;
5028 continue;
5029 }
5030
Victor Stinner1d65d912015-10-05 13:43:50 +02005031 if (error_handler == _Py_ERROR_UNKNOWN)
5032 error_handler = get_error_handler(errors);
5033
5034 switch (error_handler) {
5035 case _Py_ERROR_IGNORE:
5036 s += (endinpos - startinpos);
5037 break;
5038
5039 case _Py_ERROR_REPLACE:
5040 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
5041 goto onError;
5042 s += (endinpos - startinpos);
5043 break;
5044
5045 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner74e8fac2015-10-05 13:49:26 +02005046 {
5047 Py_ssize_t i;
5048
Victor Stinner1d65d912015-10-05 13:43:50 +02005049 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
5050 goto onError;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005051 for (i=startinpos; i<endinpos; i++) {
Victor Stinner1d65d912015-10-05 13:43:50 +02005052 ch = (Py_UCS4)(unsigned char)(starts[i]);
5053 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
5054 ch + 0xdc00);
5055 writer.pos++;
5056 }
5057 s += (endinpos - startinpos);
5058 break;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005059 }
Victor Stinner1d65d912015-10-05 13:43:50 +02005060
5061 default:
5062 if (unicode_decode_call_errorhandler_writer(
5063 errors, &error_handler_obj,
5064 "utf-8", errmsg,
5065 &starts, &end, &startinpos, &endinpos, &exc, &s,
5066 &writer))
5067 goto onError;
5068 }
Victor Stinner785938e2011-12-11 20:09:03 +01005069 }
5070
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005071End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005072 if (consumed)
5073 *consumed = s - starts;
5074
Victor Stinner1d65d912015-10-05 13:43:50 +02005075 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005076 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005077 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005078
5079onError:
Victor Stinner1d65d912015-10-05 13:43:50 +02005080 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005081 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005082 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005083 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01005084}
5085
#ifdef __APPLE__

/* Simplified UTF-8 decoder using the surrogateescape error handler,
   used to decode the command line arguments on Mac OS X.

   s    - input bytes (need not be NUL-terminated; `size` bytes are read).
   size - number of input bytes.

   Every byte that cannot be decoded is mapped to the lone surrogate
   0xDC00 + byte.  The result is NUL-terminated.

   Return a pointer to a newly allocated wide character string (use
   PyMem_RawFree() to free the memory), or NULL on memory allocation
   error (including a size too large to allocate). */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    const char *e;
    wchar_t *unicode;
    Py_ssize_t outpos;

    /* Note: size will always be longer than the resulting Unicode
       character count, so size + 1 elements are enough for the result
       plus the terminating NUL.  The guard is written without forming
       size + 1 first, so it cannot itself overflow when size is
       PY_SSIZE_T_MAX. */
    if (size > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1)
        return NULL;
    unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
    if (!unicode)
        return NULL;

    /* Unpack UTF-8 encoded data */
    e = s + size;
    outpos = 0;
    while (s < e) {
        Py_UCS4 ch;
#if SIZEOF_WCHAR_T == 4
        ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
#else
        ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
#endif
        if (ch > 0xFF) {
#if SIZEOF_WCHAR_T == 4
            /* A 4-byte buffer can hold every code point, so the decoder
               is not expected to hand a character back. */
            assert(0);
#else
            /* The decoder handed back a character that does not fit in
               16 bits.  It must be a non-BMP code point (not a
               surrogate): that is the only kind of value that can be
               split into a high/low surrogate pair. */
            assert(ch > 0xFFFF && ch <= 0x10FFFF);
            /* compute and append the two surrogates: */
            unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
            unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
#endif
        }
        else {
            /* ch is a status code here: 0 with the input exhausted
               means we are done. */
            if (!ch && s == e)
                break;
            /* surrogateescape: escape one undecodable byte and retry
               from the following byte. */
            unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
        }
    }
    unicode[outpos] = L'\0';
    return unicode;
}

#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00005141
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005142/* Primary internal function which creates utf8 encoded bytes objects.
5143
5144 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005145 and allocate exactly as much space needed at the end. Else allocate the
5146 maximum possible needed (4 result bytes per Unicode character), and return
5147 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005148*/
Tim Peters7e3d9612002-04-21 03:26:37 +00005149PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01005150_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005151{
Victor Stinner6099a032011-12-18 14:22:26 +01005152 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005153 void *data;
5154 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00005155
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005156 if (!PyUnicode_Check(unicode)) {
5157 PyErr_BadArgument();
5158 return NULL;
5159 }
5160
5161 if (PyUnicode_READY(unicode) == -1)
5162 return NULL;
5163
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005164 if (PyUnicode_UTF8(unicode))
5165 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5166 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005167
5168 kind = PyUnicode_KIND(unicode);
5169 data = PyUnicode_DATA(unicode);
5170 size = PyUnicode_GET_LENGTH(unicode);
5171
Benjamin Petersonead6b532011-12-20 17:23:42 -06005172 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005173 default:
5174 assert(0);
5175 case PyUnicode_1BYTE_KIND:
5176 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5177 assert(!PyUnicode_IS_ASCII(unicode));
5178 return ucs1lib_utf8_encoder(unicode, data, size, errors);
5179 case PyUnicode_2BYTE_KIND:
5180 return ucs2lib_utf8_encoder(unicode, data, size, errors);
5181 case PyUnicode_4BYTE_KIND:
5182 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00005183 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005184}
5185
Alexander Belopolsky40018472011-02-26 01:02:56 +00005186PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005187PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5188 Py_ssize_t size,
5189 const char *errors)
5190{
5191 PyObject *v, *unicode;
5192
5193 unicode = PyUnicode_FromUnicode(s, size);
5194 if (unicode == NULL)
5195 return NULL;
5196 v = _PyUnicode_AsUTF8String(unicode, errors);
5197 Py_DECREF(unicode);
5198 return v;
5199}
5200
5201PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005202PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005203{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005204 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005205}
5206
/* --- UTF-32 Codec ------------------------------------------------------- */
5208
5209PyObject *
5210PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005211 Py_ssize_t size,
5212 const char *errors,
5213 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005214{
5215 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5216}
5217
5218PyObject *
5219PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005220 Py_ssize_t size,
5221 const char *errors,
5222 int *byteorder,
5223 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005224{
5225 const char *starts = s;
5226 Py_ssize_t startinpos;
5227 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005228 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005229 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01005230 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005231 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005232 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005233 PyObject *errorHandler = NULL;
5234 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005235
Walter Dörwald41980ca2007-08-16 21:55:45 +00005236 q = (unsigned char *)s;
5237 e = q + size;
5238
5239 if (byteorder)
5240 bo = *byteorder;
5241
5242 /* Check for BOM marks (U+FEFF) in the input and adjust current
5243 byte order setting accordingly. In native mode, the leading BOM
5244 mark is skipped, in all other modes, it is copied to the output
5245 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01005246 if (bo == 0 && size >= 4) {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005247 Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005248 if (bom == 0x0000FEFF) {
5249 bo = -1;
5250 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005251 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005252 else if (bom == 0xFFFE0000) {
5253 bo = 1;
5254 q += 4;
5255 }
5256 if (byteorder)
5257 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005258 }
5259
Victor Stinnere64322e2012-10-30 23:12:47 +01005260 if (q == e) {
5261 if (consumed)
5262 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005263 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00005264 }
5265
Victor Stinnere64322e2012-10-30 23:12:47 +01005266#ifdef WORDS_BIGENDIAN
5267 le = bo < 0;
5268#else
5269 le = bo <= 0;
5270#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005271 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01005272
Victor Stinner8f674cc2013-04-17 23:02:17 +02005273 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005274 writer.min_length = (e - q + 3) / 4;
5275 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005276 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005277
Victor Stinnere64322e2012-10-30 23:12:47 +01005278 while (1) {
5279 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005280 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005281
Victor Stinnere64322e2012-10-30 23:12:47 +01005282 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005283 enum PyUnicode_Kind kind = writer.kind;
5284 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005285 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005286 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005287 if (le) {
5288 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005289 ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005290 if (ch > maxch)
5291 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005292 if (kind != PyUnicode_1BYTE_KIND &&
5293 Py_UNICODE_IS_SURROGATE(ch))
5294 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005295 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005296 q += 4;
5297 } while (q <= last);
5298 }
5299 else {
5300 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005301 ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
Victor Stinnere64322e2012-10-30 23:12:47 +01005302 if (ch > maxch)
5303 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005304 if (kind != PyUnicode_1BYTE_KIND &&
5305 Py_UNICODE_IS_SURROGATE(ch))
5306 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005307 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005308 q += 4;
5309 } while (q <= last);
5310 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005311 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005312 }
5313
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005314 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005315 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005316 startinpos = ((const char *)q) - starts;
5317 endinpos = startinpos + 4;
5318 }
5319 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005320 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005321 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005322 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005323 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005324 startinpos = ((const char *)q) - starts;
5325 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005326 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005327 else {
5328 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005329 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005330 goto onError;
5331 q += 4;
5332 continue;
5333 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005334 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005335 startinpos = ((const char *)q) - starts;
5336 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005337 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005338
5339 /* The remaining input chars are ignored if the callback
5340 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005341 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005342 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005343 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005344 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005345 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005346 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005347 }
5348
Walter Dörwald41980ca2007-08-16 21:55:45 +00005349 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005350 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005351
Walter Dörwald41980ca2007-08-16 21:55:45 +00005352 Py_XDECREF(errorHandler);
5353 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005354 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005355
Benjamin Peterson29060642009-01-31 22:14:21 +00005356 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005357 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005358 Py_XDECREF(errorHandler);
5359 Py_XDECREF(exc);
5360 return NULL;
5361}
5362
5363PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005364_PyUnicode_EncodeUTF32(PyObject *str,
5365 const char *errors,
5366 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005367{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005368 enum PyUnicode_Kind kind;
5369 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005370 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005371 PyObject *v;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005372 uint32_t *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005373#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005374 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005375#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005376 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005377#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005378 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005379 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005380 PyObject *errorHandler = NULL;
5381 PyObject *exc = NULL;
5382 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005383
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005384 if (!PyUnicode_Check(str)) {
5385 PyErr_BadArgument();
5386 return NULL;
5387 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005388 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005389 return NULL;
5390 kind = PyUnicode_KIND(str);
5391 data = PyUnicode_DATA(str);
5392 len = PyUnicode_GET_LENGTH(str);
5393
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005394 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005395 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005396 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005397 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005398 if (v == NULL)
5399 return NULL;
5400
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005401 /* output buffer is 4-bytes aligned */
5402 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005403 out = (uint32_t *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005404 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005405 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005406 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005407 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005408
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005409 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005410 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005411 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005412 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005413 else
5414 encoding = "utf-32";
5415
5416 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005417 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5418 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005419 }
5420
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005421 pos = 0;
5422 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005423 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005424
5425 if (kind == PyUnicode_2BYTE_KIND) {
5426 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5427 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005428 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005429 else {
5430 assert(kind == PyUnicode_4BYTE_KIND);
5431 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5432 &out, native_ordering);
5433 }
5434 if (pos == len)
5435 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005436
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005437 rep = unicode_encode_call_errorhandler(
5438 errors, &errorHandler,
5439 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005440 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005441 if (!rep)
5442 goto error;
5443
5444 if (PyBytes_Check(rep)) {
5445 repsize = PyBytes_GET_SIZE(rep);
5446 if (repsize & 3) {
5447 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005448 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005449 "surrogates not allowed");
5450 goto error;
5451 }
5452 moreunits = repsize / 4;
5453 }
5454 else {
5455 assert(PyUnicode_Check(rep));
5456 if (PyUnicode_READY(rep) < 0)
5457 goto error;
5458 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5459 if (!PyUnicode_IS_ASCII(rep)) {
5460 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005461 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005462 "surrogates not allowed");
5463 goto error;
5464 }
5465 }
5466
5467 /* four bytes are reserved for each surrogate */
5468 if (moreunits > 1) {
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005469 Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005470 Py_ssize_t morebytes = 4 * (moreunits - 1);
5471 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5472 /* integer overflow */
5473 PyErr_NoMemory();
5474 goto error;
5475 }
5476 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5477 goto error;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005478 out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005479 }
5480
5481 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005482 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005483 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005484 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005485 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005486 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5487 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005488 }
5489
5490 Py_CLEAR(rep);
5491 }
5492
5493 /* Cut back to size actually needed. This is necessary for, for example,
5494 encoding of a string containing isolated surrogates and the 'ignore'
5495 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005496 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005497 if (nsize != PyBytes_GET_SIZE(v))
5498 _PyBytes_Resize(&v, nsize);
5499 Py_XDECREF(errorHandler);
5500 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005501 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005502 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005503 error:
5504 Py_XDECREF(rep);
5505 Py_XDECREF(errorHandler);
5506 Py_XDECREF(exc);
5507 Py_XDECREF(v);
5508 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005509}
5510
Alexander Belopolsky40018472011-02-26 01:02:56 +00005511PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005512PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5513 Py_ssize_t size,
5514 const char *errors,
5515 int byteorder)
5516{
5517 PyObject *result;
5518 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5519 if (tmp == NULL)
5520 return NULL;
5521 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5522 Py_DECREF(tmp);
5523 return result;
5524}
5525
5526PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005527PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005528{
Victor Stinnerb960b342011-11-20 19:12:52 +01005529 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005530}
5531
/* --- UTF-16 Codec ------------------------------------------------------- */
5533
Tim Peters772747b2001-08-09 22:21:55 +00005534PyObject *
5535PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005536 Py_ssize_t size,
5537 const char *errors,
5538 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005539{
Walter Dörwald69652032004-09-07 20:24:22 +00005540 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5541}
5542
5543PyObject *
5544PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005545 Py_ssize_t size,
5546 const char *errors,
5547 int *byteorder,
5548 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005549{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005550 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005551 Py_ssize_t startinpos;
5552 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005553 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005554 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005555 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005556 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005557 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005558 PyObject *errorHandler = NULL;
5559 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005560 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005561
Tim Peters772747b2001-08-09 22:21:55 +00005562 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005563 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005564
5565 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005566 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005567
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005568 /* Check for BOM marks (U+FEFF) in the input and adjust current
5569 byte order setting accordingly. In native mode, the leading BOM
5570 mark is skipped, in all other modes, it is copied to the output
5571 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005572 if (bo == 0 && size >= 2) {
5573 const Py_UCS4 bom = (q[1] << 8) | q[0];
5574 if (bom == 0xFEFF) {
5575 q += 2;
5576 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005577 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005578 else if (bom == 0xFFFE) {
5579 q += 2;
5580 bo = 1;
5581 }
5582 if (byteorder)
5583 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005584 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005585
Antoine Pitrou63065d72012-05-15 23:48:04 +02005586 if (q == e) {
5587 if (consumed)
5588 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005589 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005590 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005591
Christian Heimes743e0cd2012-10-17 23:52:17 +02005592#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005593 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005594 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005595#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005596 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005597 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005598#endif
Tim Peters772747b2001-08-09 22:21:55 +00005599
Antoine Pitrou63065d72012-05-15 23:48:04 +02005600 /* Note: size will always be longer than the resulting Unicode
5601 character count */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005602 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005603 writer.min_length = (e - q + 1) / 2;
5604 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005605 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005606
Antoine Pitrou63065d72012-05-15 23:48:04 +02005607 while (1) {
5608 Py_UCS4 ch = 0;
5609 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005610 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005611 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005612 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005613 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005614 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005615 native_ordering);
5616 else
5617 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005618 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005619 native_ordering);
5620 } else if (kind == PyUnicode_2BYTE_KIND) {
5621 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005622 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005623 native_ordering);
5624 } else {
5625 assert(kind == PyUnicode_4BYTE_KIND);
5626 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005627 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005628 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005629 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005630 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005631
Antoine Pitrou63065d72012-05-15 23:48:04 +02005632 switch (ch)
5633 {
5634 case 0:
5635 /* remaining byte at the end? (size should be even) */
5636 if (q == e || consumed)
5637 goto End;
5638 errmsg = "truncated data";
5639 startinpos = ((const char *)q) - starts;
5640 endinpos = ((const char *)e) - starts;
5641 break;
5642 /* The remaining input chars are ignored if the callback
5643 chooses to skip the input */
5644 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005645 q -= 2;
5646 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005647 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005648 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005649 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005650 endinpos = ((const char *)e) - starts;
5651 break;
5652 case 2:
5653 errmsg = "illegal encoding";
5654 startinpos = ((const char *)q) - 2 - starts;
5655 endinpos = startinpos + 2;
5656 break;
5657 case 3:
5658 errmsg = "illegal UTF-16 surrogate";
5659 startinpos = ((const char *)q) - 4 - starts;
5660 endinpos = startinpos + 2;
5661 break;
5662 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005663 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005664 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005665 continue;
5666 }
5667
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005668 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005669 errors,
5670 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005671 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00005672 &starts,
5673 (const char **)&e,
5674 &startinpos,
5675 &endinpos,
5676 &exc,
5677 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005678 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005679 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005680 }
5681
Antoine Pitrou63065d72012-05-15 23:48:04 +02005682End:
Walter Dörwald69652032004-09-07 20:24:22 +00005683 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005684 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005685
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005686 Py_XDECREF(errorHandler);
5687 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005688 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005689
Benjamin Peterson29060642009-01-31 22:14:21 +00005690 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005691 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005692 Py_XDECREF(errorHandler);
5693 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005694 return NULL;
5695}
5696
Tim Peters772747b2001-08-09 22:21:55 +00005697PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005698_PyUnicode_EncodeUTF16(PyObject *str,
5699 const char *errors,
5700 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005701{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005702 enum PyUnicode_Kind kind;
5703 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005704 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005705 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005706 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005707 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005708#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005709 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005710#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005711 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005712#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005713 const char *encoding;
5714 Py_ssize_t nsize, pos;
5715 PyObject *errorHandler = NULL;
5716 PyObject *exc = NULL;
5717 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00005718
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005719 if (!PyUnicode_Check(str)) {
5720 PyErr_BadArgument();
5721 return NULL;
5722 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005723 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005724 return NULL;
5725 kind = PyUnicode_KIND(str);
5726 data = PyUnicode_DATA(str);
5727 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005728
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005729 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005730 if (kind == PyUnicode_4BYTE_KIND) {
5731 const Py_UCS4 *in = (const Py_UCS4 *)data;
5732 const Py_UCS4 *end = in + len;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005733 while (in < end) {
5734 if (*in++ >= 0x10000) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005735 pairs++;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005736 }
5737 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005738 }
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005739 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005740 return PyErr_NoMemory();
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005741 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005742 nsize = len + pairs + (byteorder == 0);
5743 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005744 if (v == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005745 return NULL;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005746 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005747
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005748 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005749 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005750 out = (unsigned short *)PyBytes_AS_STRING(v);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005751 if (byteorder == 0) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005752 *out++ = 0xFEFF;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005753 }
5754 if (len == 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00005755 goto done;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005756 }
Tim Peters772747b2001-08-09 22:21:55 +00005757
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005758 if (kind == PyUnicode_1BYTE_KIND) {
5759 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5760 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005761 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005762
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005763 if (byteorder < 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005764 encoding = "utf-16-le";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005765 }
5766 else if (byteorder > 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005767 encoding = "utf-16-be";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005768 }
5769 else {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005770 encoding = "utf-16";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005771 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005772
5773 pos = 0;
5774 while (pos < len) {
5775 Py_ssize_t repsize, moreunits;
5776
5777 if (kind == PyUnicode_2BYTE_KIND) {
5778 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
5779 &out, native_ordering);
5780 }
5781 else {
5782 assert(kind == PyUnicode_4BYTE_KIND);
5783 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
5784 &out, native_ordering);
5785 }
5786 if (pos == len)
5787 break;
5788
5789 rep = unicode_encode_call_errorhandler(
5790 errors, &errorHandler,
5791 encoding, "surrogates not allowed",
5792 str, &exc, pos, pos + 1, &pos);
5793 if (!rep)
5794 goto error;
5795
5796 if (PyBytes_Check(rep)) {
5797 repsize = PyBytes_GET_SIZE(rep);
5798 if (repsize & 1) {
5799 raise_encode_exception(&exc, encoding,
5800 str, pos - 1, pos,
5801 "surrogates not allowed");
5802 goto error;
5803 }
5804 moreunits = repsize / 2;
5805 }
5806 else {
5807 assert(PyUnicode_Check(rep));
5808 if (PyUnicode_READY(rep) < 0)
5809 goto error;
5810 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5811 if (!PyUnicode_IS_ASCII(rep)) {
5812 raise_encode_exception(&exc, encoding,
5813 str, pos - 1, pos,
5814 "surrogates not allowed");
5815 goto error;
5816 }
5817 }
5818
5819 /* two bytes are reserved for each surrogate */
5820 if (moreunits > 1) {
5821 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
5822 Py_ssize_t morebytes = 2 * (moreunits - 1);
5823 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5824 /* integer overflow */
5825 PyErr_NoMemory();
5826 goto error;
5827 }
5828 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5829 goto error;
5830 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
5831 }
5832
5833 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005834 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005835 out += moreunits;
5836 } else /* rep is unicode */ {
5837 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5838 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5839 &out, native_ordering);
5840 }
5841
5842 Py_CLEAR(rep);
5843 }
5844
5845 /* Cut back to size actually needed. This is necessary for, for example,
5846 encoding of a string containing isolated surrogates and the 'ignore' handler
5847 is used. */
5848 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5849 if (nsize != PyBytes_GET_SIZE(v))
5850 _PyBytes_Resize(&v, nsize);
5851 Py_XDECREF(errorHandler);
5852 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00005853 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005854 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005855 error:
5856 Py_XDECREF(rep);
5857 Py_XDECREF(errorHandler);
5858 Py_XDECREF(exc);
5859 Py_XDECREF(v);
5860 return NULL;
5861#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005862}
5863
Alexander Belopolsky40018472011-02-26 01:02:56 +00005864PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005865PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5866 Py_ssize_t size,
5867 const char *errors,
5868 int byteorder)
5869{
5870 PyObject *result;
5871 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5872 if (tmp == NULL)
5873 return NULL;
5874 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5875 Py_DECREF(tmp);
5876 return result;
5877}
5878
5879PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005880PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005881{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005882 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005883}
5884
5885/* --- Unicode Escape Codec ----------------------------------------------- */
5886
Fredrik Lundh06d12682001-01-24 07:59:11 +00005887static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005888
Alexander Belopolsky40018472011-02-26 01:02:56 +00005889PyObject *
5890PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005891 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005892 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005893{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005894 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005895 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005896 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005897 PyObject *errorHandler = NULL;
5898 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005899
Victor Stinner62ec3312016-09-06 17:04:34 -07005900 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005901 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07005902 }
5903 /* Escaped strings will always be longer than the resulting
5904 Unicode string, so we start with size here and then reduce the
5905 length after conversion to the true value.
5906 (but if the error callback returns a long replacement string
5907 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005908 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07005909 writer.min_length = size;
5910 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
5911 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005912 }
5913
Guido van Rossumd57fd912000-03-10 22:53:23 +00005914 end = s + size;
5915 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07005916 unsigned char c = (unsigned char) *s++;
5917 Py_UCS4 ch;
5918 int count;
5919 Py_ssize_t startinpos;
5920 Py_ssize_t endinpos;
5921 const char *message;
5922
5923#define WRITE_ASCII_CHAR(ch) \
5924 do { \
5925 assert(ch <= 127); \
5926 assert(writer.pos < writer.size); \
5927 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
5928 } while(0)
5929
5930#define WRITE_CHAR(ch) \
5931 do { \
5932 if (ch <= writer.maxchar) { \
5933 assert(writer.pos < writer.size); \
5934 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
5935 } \
5936 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
5937 goto onError; \
5938 } \
5939 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005940
5941 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07005942 if (c != '\\') {
5943 WRITE_CHAR(c);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005944 continue;
5945 }
5946
Victor Stinner62ec3312016-09-06 17:04:34 -07005947 startinpos = s - starts - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005948 /* \ - Escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07005949 if (s >= end) {
5950 message = "\\ at end of string";
5951 goto error;
5952 }
5953 c = (unsigned char) *s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005954
Victor Stinner62ec3312016-09-06 17:04:34 -07005955 assert(writer.pos < writer.size);
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005956 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005957
Benjamin Peterson29060642009-01-31 22:14:21 +00005958 /* \x escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07005959 case '\n': continue;
5960 case '\\': WRITE_ASCII_CHAR('\\'); continue;
5961 case '\'': WRITE_ASCII_CHAR('\''); continue;
5962 case '\"': WRITE_ASCII_CHAR('\"'); continue;
5963 case 'b': WRITE_ASCII_CHAR('\b'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005964 /* FF */
Victor Stinner62ec3312016-09-06 17:04:34 -07005965 case 'f': WRITE_ASCII_CHAR('\014'); continue;
5966 case 't': WRITE_ASCII_CHAR('\t'); continue;
5967 case 'n': WRITE_ASCII_CHAR('\n'); continue;
5968 case 'r': WRITE_ASCII_CHAR('\r'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005969 /* VT */
Victor Stinner62ec3312016-09-06 17:04:34 -07005970 case 'v': WRITE_ASCII_CHAR('\013'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005971 /* BEL, not classic C */
Victor Stinner62ec3312016-09-06 17:04:34 -07005972 case 'a': WRITE_ASCII_CHAR('\007'); continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005973
Benjamin Peterson29060642009-01-31 22:14:21 +00005974 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005975 case '0': case '1': case '2': case '3':
5976 case '4': case '5': case '6': case '7':
Victor Stinner62ec3312016-09-06 17:04:34 -07005977 ch = c - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005978 if (s < end && '0' <= *s && *s <= '7') {
Victor Stinner62ec3312016-09-06 17:04:34 -07005979 ch = (ch<<3) + *s++ - '0';
5980 if (s < end && '0' <= *s && *s <= '7') {
5981 ch = (ch<<3) + *s++ - '0';
5982 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005983 }
Victor Stinner62ec3312016-09-06 17:04:34 -07005984 WRITE_CHAR(ch);
5985 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005986
Benjamin Peterson29060642009-01-31 22:14:21 +00005987 /* hex escapes */
5988 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005989 case 'x':
Victor Stinner62ec3312016-09-06 17:04:34 -07005990 count = 2;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005991 message = "truncated \\xXX escape";
5992 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005993
Benjamin Peterson29060642009-01-31 22:14:21 +00005994 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005995 case 'u':
Victor Stinner62ec3312016-09-06 17:04:34 -07005996 count = 4;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005997 message = "truncated \\uXXXX escape";
5998 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005999
Benjamin Peterson29060642009-01-31 22:14:21 +00006000 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00006001 case 'U':
Victor Stinner62ec3312016-09-06 17:04:34 -07006002 count = 8;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006003 message = "truncated \\UXXXXXXXX escape";
6004 hexescape:
Victor Stinner62ec3312016-09-06 17:04:34 -07006005 for (ch = 0; count && s < end; ++s, --count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006006 c = (unsigned char)*s;
Victor Stinner62ec3312016-09-06 17:04:34 -07006007 ch <<= 4;
6008 if (c >= '0' && c <= '9') {
6009 ch += c - '0';
6010 }
6011 else if (c >= 'a' && c <= 'f') {
6012 ch += c - ('a' - 10);
6013 }
6014 else if (c >= 'A' && c <= 'F') {
6015 ch += c - ('A' - 10);
6016 }
6017 else {
6018 break;
6019 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00006020 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006021 if (count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006022 goto error;
Victor Stinner62ec3312016-09-06 17:04:34 -07006023 }
6024
6025 /* when we get here, ch is a 32-bit unicode character */
6026 if (ch > MAX_UNICODE) {
6027 message = "illegal Unicode character";
6028 goto error;
6029 }
6030
6031 WRITE_CHAR(ch);
6032 continue;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006033
Benjamin Peterson29060642009-01-31 22:14:21 +00006034 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006035 case 'N':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006036 if (ucnhash_CAPI == NULL) {
6037 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006038 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6039 PyUnicodeData_CAPSULE_NAME, 1);
Victor Stinner62ec3312016-09-06 17:04:34 -07006040 if (ucnhash_CAPI == NULL) {
6041 PyErr_SetString(
6042 PyExc_UnicodeError,
6043 "\\N escapes not supported (can't load unicodedata module)"
6044 );
6045 goto onError;
6046 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006047 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006048
6049 message = "malformed \\N character escape";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006050 if (*s == '{') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006051 const char *start = ++s;
6052 size_t namelen;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006053 /* look for the closing brace */
Victor Stinner62ec3312016-09-06 17:04:34 -07006054 while (s < end && *s != '}')
Fredrik Lundhccc74732001-02-18 22:13:49 +00006055 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006056 namelen = s - start;
6057 if (namelen && s < end) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00006058 /* found a name. look it up in the unicode database */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006059 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006060 ch = 0xffffffff; /* in case 'getcode' messes up */
6061 if (namelen <= INT_MAX &&
6062 ucnhash_CAPI->getcode(NULL, start, (int)namelen,
6063 &ch, 0)) {
6064 assert(ch <= MAX_UNICODE);
6065 WRITE_CHAR(ch);
6066 continue;
6067 }
6068 message = "unknown Unicode character name";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006069 }
6070 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006071 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006072
6073 default:
R David Murray110b6fe2016-09-08 15:34:08 -04006074 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6075 "invalid escape sequence '\\%c'", c) < 0)
6076 goto onError;
Victor Stinner62ec3312016-09-06 17:04:34 -07006077 WRITE_ASCII_CHAR('\\');
6078 WRITE_CHAR(c);
6079 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006080 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006081
6082 error:
6083 endinpos = s-starts;
Victor Stinner62ec3312016-09-06 17:04:34 -07006084 writer.min_length = end - s + writer.pos;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006085 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02006086 errors, &errorHandler,
6087 "unicodeescape", message,
6088 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner62ec3312016-09-06 17:04:34 -07006089 &writer)) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006090 goto onError;
Victor Stinner62ec3312016-09-06 17:04:34 -07006091 }
6092 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0) {
6093 goto onError;
6094 }
6095
6096#undef WRITE_ASCII_CHAR
6097#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006098 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006099
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006100 Py_XDECREF(errorHandler);
6101 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006102 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00006103
Benjamin Peterson29060642009-01-31 22:14:21 +00006104 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006105 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006106 Py_XDECREF(errorHandler);
6107 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006108 return NULL;
6109}
6110
6111/* Return a Unicode-Escape string version of the Unicode object.
6112
6113 If quotes is true, the string is enclosed in u"" or u'' quotes as
6114 appropriate.
6115
6116*/
6117
Alexander Belopolsky40018472011-02-26 01:02:56 +00006118PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006119PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006120{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006121 Py_ssize_t i, len;
Victor Stinner62ec3312016-09-06 17:04:34 -07006122 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006123 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006124 enum PyUnicode_Kind kind;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006125 void *data;
Victor Stinner62ec3312016-09-06 17:04:34 -07006126 Py_ssize_t expandsize;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006127
Ezio Melottie7f90372012-10-05 03:33:31 +03006128 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00006129 escape.
6130
Ezio Melottie7f90372012-10-05 03:33:31 +03006131 For UCS1 strings it's '\xxx', 4 bytes per source character.
6132 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6133 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00006134 */
6135
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006136 if (!PyUnicode_Check(unicode)) {
6137 PyErr_BadArgument();
6138 return NULL;
6139 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006140 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006141 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006142 }
Victor Stinner358af132015-10-12 22:36:57 +02006143
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006144 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006145 if (len == 0) {
6146 return PyBytes_FromStringAndSize(NULL, 0);
6147 }
6148
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006149 kind = PyUnicode_KIND(unicode);
6150 data = PyUnicode_DATA(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006151 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6152 bytes, and 1 byte characters 4. */
6153 expandsize = kind * 2 + 2;
6154 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize) {
6155 return PyErr_NoMemory();
6156 }
6157 repr = PyBytes_FromStringAndSize(NULL, 2 + expandsize * len + 1);
6158 if (repr == NULL) {
6159 return NULL;
6160 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006161
Victor Stinner62ec3312016-09-06 17:04:34 -07006162 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006163 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006164 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006165
Victor Stinner62ec3312016-09-06 17:04:34 -07006166 /* U+0000-U+00ff range */
6167 if (ch < 0x100) {
6168 if (ch >= ' ' && ch < 127) {
6169 if (ch != '\\') {
6170 /* Copy printable US ASCII as-is */
6171 *p++ = (char) ch;
6172 }
6173 /* Escape backslashes */
6174 else {
6175 *p++ = '\\';
6176 *p++ = '\\';
6177 }
6178 }
Victor Stinner358af132015-10-12 22:36:57 +02006179
Victor Stinner62ec3312016-09-06 17:04:34 -07006180 /* Map special whitespace to '\t', \n', '\r' */
6181 else if (ch == '\t') {
6182 *p++ = '\\';
6183 *p++ = 't';
6184 }
6185 else if (ch == '\n') {
6186 *p++ = '\\';
6187 *p++ = 'n';
6188 }
6189 else if (ch == '\r') {
6190 *p++ = '\\';
6191 *p++ = 'r';
6192 }
6193
6194 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6195 else {
6196 *p++ = '\\';
6197 *p++ = 'x';
6198 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6199 *p++ = Py_hexdigits[ch & 0x000F];
6200 }
Tim Petersced69f82003-09-16 20:30:58 +00006201 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006202 /* U+0000-U+00ff range: Map 16-bit characters to '\uHHHH' */
6203 else if (ch < 0x10000) {
6204 /* U+0100-U+ffff */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006205 *p++ = '\\';
6206 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006207 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6208 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6209 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6210 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006211 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006212 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6213 else {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006214
Victor Stinner62ec3312016-09-06 17:04:34 -07006215 /* Make sure that the first two digits are zero */
6216 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006217 *p++ = '\\';
Victor Stinner62ec3312016-09-06 17:04:34 -07006218 *p++ = 'U';
6219 *p++ = '0';
6220 *p++ = '0';
6221 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6222 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6223 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6224 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6225 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6226 *p++ = Py_hexdigits[ch & 0x0000000F];
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006227 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006228 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006229
Victor Stinner62ec3312016-09-06 17:04:34 -07006230 assert(p - PyBytes_AS_STRING(repr) > 0);
6231 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6232 return NULL;
6233 }
6234 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006235}
6236
Alexander Belopolsky40018472011-02-26 01:02:56 +00006237PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006238PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6239 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006240{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006241 PyObject *result;
6242 PyObject *tmp = PyUnicode_FromUnicode(s, size);
Victor Stinner62ec3312016-09-06 17:04:34 -07006243 if (tmp == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006244 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006245 }
6246
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006247 result = PyUnicode_AsUnicodeEscapeString(tmp);
6248 Py_DECREF(tmp);
6249 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006250}
6251
6252/* --- Raw Unicode Escape Codec ------------------------------------------- */
6253
Alexander Belopolsky40018472011-02-26 01:02:56 +00006254PyObject *
6255PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006256 Py_ssize_t size,
6257 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006258{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006259 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006260 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006261 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006262 PyObject *errorHandler = NULL;
6263 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006264
Victor Stinner62ec3312016-09-06 17:04:34 -07006265 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006266 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006267 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006268
Guido van Rossumd57fd912000-03-10 22:53:23 +00006269 /* Escaped strings will always be longer than the resulting
6270 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006271 length after conversion to the true value. (But decoding error
6272 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006273 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07006274 writer.min_length = size;
6275 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6276 goto onError;
6277 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006278
Guido van Rossumd57fd912000-03-10 22:53:23 +00006279 end = s + size;
6280 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006281 unsigned char c = (unsigned char) *s++;
6282 Py_UCS4 ch;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006283 int count;
Victor Stinner62ec3312016-09-06 17:04:34 -07006284 Py_ssize_t startinpos;
6285 Py_ssize_t endinpos;
6286 const char *message;
6287
6288#define WRITE_CHAR(ch) \
6289 do { \
6290 if (ch <= writer.maxchar) { \
6291 assert(writer.pos < writer.size); \
6292 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6293 } \
6294 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6295 goto onError; \
6296 } \
6297 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006298
Benjamin Peterson29060642009-01-31 22:14:21 +00006299 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006300 if (c != '\\' || s >= end) {
6301 WRITE_CHAR(c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006302 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006303 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006304
Victor Stinner62ec3312016-09-06 17:04:34 -07006305 c = (unsigned char) *s++;
6306 if (c == 'u') {
6307 count = 4;
6308 message = "truncated \\uXXXX escape";
Benjamin Peterson29060642009-01-31 22:14:21 +00006309 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006310 else if (c == 'U') {
6311 count = 8;
6312 message = "truncated \\UXXXXXXXX escape";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006313 }
6314 else {
Victor Stinner62ec3312016-09-06 17:04:34 -07006315 assert(writer.pos < writer.size);
6316 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6317 WRITE_CHAR(c);
6318 continue;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006319 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006320 startinpos = s - starts - 2;
6321
6322 /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6323 for (ch = 0; count && s < end; ++s, --count) {
6324 c = (unsigned char)*s;
6325 ch <<= 4;
6326 if (c >= '0' && c <= '9') {
6327 ch += c - '0';
6328 }
6329 else if (c >= 'a' && c <= 'f') {
6330 ch += c - ('a' - 10);
6331 }
6332 else if (c >= 'A' && c <= 'F') {
6333 ch += c - ('A' - 10);
6334 }
6335 else {
6336 break;
6337 }
6338 }
6339 if (!count) {
6340 if (ch <= MAX_UNICODE) {
6341 WRITE_CHAR(ch);
6342 continue;
6343 }
6344 message = "\\Uxxxxxxxx out of range";
6345 }
6346
6347 endinpos = s-starts;
6348 writer.min_length = end - s + writer.pos;
6349 if (unicode_decode_call_errorhandler_writer(
6350 errors, &errorHandler,
6351 "rawunicodeescape", message,
6352 &starts, &end, &startinpos, &endinpos, &exc, &s,
6353 &writer)) {
6354 goto onError;
6355 }
6356 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0) {
6357 goto onError;
6358 }
6359
6360#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006361 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006362 Py_XDECREF(errorHandler);
6363 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006364 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006365
Benjamin Peterson29060642009-01-31 22:14:21 +00006366 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006367 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006368 Py_XDECREF(errorHandler);
6369 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006370 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006371
Guido van Rossumd57fd912000-03-10 22:53:23 +00006372}
6373
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006374
Alexander Belopolsky40018472011-02-26 01:02:56 +00006375PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006376PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006377{
Victor Stinner62ec3312016-09-06 17:04:34 -07006378 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006379 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006380 Py_ssize_t expandsize, pos;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006381 int kind;
6382 void *data;
6383 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006384
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006385 if (!PyUnicode_Check(unicode)) {
6386 PyErr_BadArgument();
6387 return NULL;
6388 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006389 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006390 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006391 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006392 kind = PyUnicode_KIND(unicode);
6393 data = PyUnicode_DATA(unicode);
6394 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006395 if (kind == PyUnicode_1BYTE_KIND) {
6396 return PyBytes_FromStringAndSize(data, len);
6397 }
Victor Stinner0e368262011-11-10 20:12:49 +01006398
Victor Stinner62ec3312016-09-06 17:04:34 -07006399 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6400 bytes, and 1 byte characters 4. */
6401 expandsize = kind * 2 + 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006402
Victor Stinner62ec3312016-09-06 17:04:34 -07006403 if (len > PY_SSIZE_T_MAX / expandsize) {
6404 return PyErr_NoMemory();
6405 }
6406 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6407 if (repr == NULL) {
6408 return NULL;
6409 }
6410 if (len == 0) {
6411 return repr;
6412 }
6413
6414 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006415 for (pos = 0; pos < len; pos++) {
6416 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Victor Stinner358af132015-10-12 22:36:57 +02006417
Victor Stinner62ec3312016-09-06 17:04:34 -07006418 /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6419 if (ch < 0x100) {
6420 *p++ = (char) ch;
Tim Petersced69f82003-09-16 20:30:58 +00006421 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006422 /* U+0000-U+00ff range: Map 16-bit characters to '\uHHHH' */
6423 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006424 *p++ = '\\';
6425 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006426 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6427 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6428 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6429 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006430 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006431 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6432 else {
6433 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6434 *p++ = '\\';
6435 *p++ = 'U';
6436 *p++ = '0';
6437 *p++ = '0';
6438 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6439 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6440 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6441 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6442 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6443 *p++ = Py_hexdigits[ch & 15];
6444 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006445 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006446
Victor Stinner62ec3312016-09-06 17:04:34 -07006447 assert(p > PyBytes_AS_STRING(repr));
6448 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6449 return NULL;
6450 }
6451 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006452}
6453
Alexander Belopolsky40018472011-02-26 01:02:56 +00006454PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006455PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6456 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006457{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006458 PyObject *result;
6459 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6460 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006461 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006462 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6463 Py_DECREF(tmp);
6464 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006465}
6466
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006467/* --- Unicode Internal Codec ------------------------------------------- */
6468
Alexander Belopolsky40018472011-02-26 01:02:56 +00006469PyObject *
6470_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006471 Py_ssize_t size,
6472 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006473{
6474 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006475 Py_ssize_t startinpos;
6476 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006477 _PyUnicodeWriter writer;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006478 const char *end;
6479 const char *reason;
6480 PyObject *errorHandler = NULL;
6481 PyObject *exc = NULL;
6482
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006483 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006484 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006485 1))
6486 return NULL;
6487
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006488 if (size == 0)
6489 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006490
Victor Stinner8f674cc2013-04-17 23:02:17 +02006491 _PyUnicodeWriter_Init(&writer);
6492 if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) {
6493 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00006494 goto onError;
Victor Stinner8f674cc2013-04-17 23:02:17 +02006495 }
6496 writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006497
Victor Stinner8f674cc2013-04-17 23:02:17 +02006498 end = s + size;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006499 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006500 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006501 Py_UCS4 ch;
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006502 if (end - s < Py_UNICODE_SIZE) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006503 endinpos = end-starts;
6504 reason = "truncated input";
6505 goto error;
6506 }
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006507 /* We copy the raw representation one byte at a time because the
6508 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006509 ((char *) &uch)[0] = s[0];
6510 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006511#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006512 ((char *) &uch)[2] = s[2];
6513 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006514#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006515 ch = uch;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006516#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006517 /* We have to sanity check the raw data, otherwise doom looms for
6518 some malformed UCS-4 data. */
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006519 if (ch > 0x10ffff) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006520 endinpos = s - starts + Py_UNICODE_SIZE;
6521 reason = "illegal code point (> 0x10FFFF)";
6522 goto error;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006523 }
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006524#endif
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006525 s += Py_UNICODE_SIZE;
6526#ifndef Py_UNICODE_WIDE
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006527 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006528 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006529 Py_UNICODE uch2;
6530 ((char *) &uch2)[0] = s[0];
6531 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006532 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006533 {
Victor Stinner551ac952011-11-29 22:58:13 +01006534 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006535 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006536 }
6537 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006538#endif
6539
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006540 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006541 goto onError;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006542 continue;
6543
6544 error:
6545 startinpos = s - starts;
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006546 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006547 errors, &errorHandler,
6548 "unicode_internal", reason,
6549 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006550 &writer))
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006551 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006552 }
6553
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006554 Py_XDECREF(errorHandler);
6555 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006556 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006557
Benjamin Peterson29060642009-01-31 22:14:21 +00006558 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006559 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006560 Py_XDECREF(errorHandler);
6561 Py_XDECREF(exc);
6562 return NULL;
6563}
6564
Guido van Rossumd57fd912000-03-10 22:53:23 +00006565/* --- Latin-1 Codec ------------------------------------------------------ */
6566
Alexander Belopolsky40018472011-02-26 01:02:56 +00006567PyObject *
6568PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006569 Py_ssize_t size,
6570 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006571{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006572 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006573 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006574}
6575
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006576/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006577static void
6578make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006579 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006580 PyObject *unicode,
6581 Py_ssize_t startpos, Py_ssize_t endpos,
6582 const char *reason)
6583{
6584 if (*exceptionObject == NULL) {
6585 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006586 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006587 encoding, unicode, startpos, endpos, reason);
6588 }
6589 else {
6590 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6591 goto onError;
6592 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6593 goto onError;
6594 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6595 goto onError;
6596 return;
6597 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006598 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006599 }
6600}
6601
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006602/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006603static void
6604raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006605 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006606 PyObject *unicode,
6607 Py_ssize_t startpos, Py_ssize_t endpos,
6608 const char *reason)
6609{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006610 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006611 encoding, unicode, startpos, endpos, reason);
6612 if (*exceptionObject != NULL)
6613 PyCodec_StrictErrors(*exceptionObject);
6614}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006615
6616/* error handling callback helper:
6617 build arguments, call the callback and check the arguments,
6618 put the result into newpos and return the replacement string, which
6619 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006620static PyObject *
6621unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006622 PyObject **errorHandler,
6623 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006624 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006625 Py_ssize_t startpos, Py_ssize_t endpos,
6626 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006627{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02006628 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006629 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006630 PyObject *restuple;
6631 PyObject *resunicode;
6632
6633 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006634 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006635 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006636 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006637 }
6638
Benjamin Petersonbac79492012-01-14 13:34:47 -05006639 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006640 return NULL;
6641 len = PyUnicode_GET_LENGTH(unicode);
6642
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006643 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006644 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006645 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006646 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006647
6648 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006649 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006650 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006651 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006652 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006653 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006654 Py_DECREF(restuple);
6655 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006656 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006657 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006658 &resunicode, newpos)) {
6659 Py_DECREF(restuple);
6660 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006661 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006662 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6663 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6664 Py_DECREF(restuple);
6665 return NULL;
6666 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006667 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006668 *newpos = len + *newpos;
6669 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02006670 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006671 Py_DECREF(restuple);
6672 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006673 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006674 Py_INCREF(resunicode);
6675 Py_DECREF(restuple);
6676 return resunicode;
6677}
6678
Alexander Belopolsky40018472011-02-26 01:02:56 +00006679static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006680unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006681 const char *errors,
Victor Stinner0030cd52015-09-24 14:45:00 +02006682 const Py_UCS4 limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006683{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006684 /* input state */
6685 Py_ssize_t pos=0, size;
6686 int kind;
6687 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006688 /* pointer into the output */
6689 char *str;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006690 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6691 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Victor Stinner50149202015-09-22 00:26:54 +02006692 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006693 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02006694 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006695 PyObject *rep = NULL;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006696 /* output object */
6697 _PyBytesWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006698
Benjamin Petersonbac79492012-01-14 13:34:47 -05006699 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006700 return NULL;
6701 size = PyUnicode_GET_LENGTH(unicode);
6702 kind = PyUnicode_KIND(unicode);
6703 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006704 /* allocate enough for a simple encoding without
6705 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006706 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006707 return PyBytes_FromStringAndSize(NULL, 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006708
6709 _PyBytesWriter_Init(&writer);
6710 str = _PyBytesWriter_Alloc(&writer, size);
6711 if (str == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006712 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006713
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006714 while (pos < size) {
Victor Stinner0030cd52015-09-24 14:45:00 +02006715 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006716
Benjamin Peterson29060642009-01-31 22:14:21 +00006717 /* can we encode this? */
Victor Stinner0030cd52015-09-24 14:45:00 +02006718 if (ch < limit) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006719 /* no overflow check, because we know that the space is enough */
Victor Stinner0030cd52015-09-24 14:45:00 +02006720 *str++ = (char)ch;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006721 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006722 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006723 else {
Victor Stinner6bd525b2015-10-09 13:10:05 +02006724 Py_ssize_t newpos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006725 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006726 Py_ssize_t collstart = pos;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006727 Py_ssize_t collend = collstart + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006728 /* find all unecodable characters */
Victor Stinner50149202015-09-22 00:26:54 +02006729
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006730 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006731 ++collend;
Victor Stinner50149202015-09-22 00:26:54 +02006732
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006733 /* Only overallocate the buffer if it's not the last write */
6734 writer.overallocate = (collend < size);
6735
Benjamin Peterson29060642009-01-31 22:14:21 +00006736 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02006737 if (error_handler == _Py_ERROR_UNKNOWN)
6738 error_handler = get_error_handler(errors);
6739
6740 switch (error_handler) {
6741 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006742 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006743 goto onError;
Victor Stinner50149202015-09-22 00:26:54 +02006744
6745 case _Py_ERROR_REPLACE:
Victor Stinner01ada392015-10-01 21:54:51 +02006746 memset(str, '?', collend - collstart);
6747 str += (collend - collstart);
Victor Stinner0030cd52015-09-24 14:45:00 +02006748 /* fall through ignore error handler */
Victor Stinner50149202015-09-22 00:26:54 +02006749 case _Py_ERROR_IGNORE:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006750 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006751 break;
Victor Stinner50149202015-09-22 00:26:54 +02006752
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006753 case _Py_ERROR_BACKSLASHREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006754 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006755 writer.min_size -= (collend - collstart);
6756 str = backslashreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006757 unicode, collstart, collend);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006758 if (str == NULL)
6759 goto onError;
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006760 pos = collend;
6761 break;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006762
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006763 case _Py_ERROR_XMLCHARREFREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006764 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006765 writer.min_size -= (collend - collstart);
6766 str = xmlcharrefreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006767 unicode, collstart, collend);
6768 if (str == NULL)
6769 goto onError;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006770 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006771 break;
Victor Stinner50149202015-09-22 00:26:54 +02006772
Victor Stinnerc3713e92015-09-29 12:32:13 +02006773 case _Py_ERROR_SURROGATEESCAPE:
6774 for (i = collstart; i < collend; ++i) {
6775 ch = PyUnicode_READ(kind, data, i);
6776 if (ch < 0xdc80 || 0xdcff < ch) {
6777 /* Not a UTF-8b surrogate */
6778 break;
6779 }
6780 *str++ = (char)(ch - 0xdc00);
6781 ++pos;
6782 }
6783 if (i >= collend)
6784 break;
6785 collstart = pos;
6786 assert(collstart != collend);
6787 /* fallback to general error handling */
6788
Benjamin Peterson29060642009-01-31 22:14:21 +00006789 default:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006790 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
6791 encoding, reason, unicode, &exc,
6792 collstart, collend, &newpos);
6793 if (rep == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006794 goto onError;
Victor Stinner0030cd52015-09-24 14:45:00 +02006795
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006796 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006797 writer.min_size -= 1;
6798
Victor Stinner6bd525b2015-10-09 13:10:05 +02006799 if (PyBytes_Check(rep)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006800 /* Directly copy bytes result to output. */
Victor Stinnerce179bf2015-10-09 12:57:22 +02006801 str = _PyBytesWriter_WriteBytes(&writer, str,
Victor Stinner6bd525b2015-10-09 13:10:05 +02006802 PyBytes_AS_STRING(rep),
6803 PyBytes_GET_SIZE(rep));
Victor Stinnerad771582015-10-09 12:38:53 +02006804 if (str == NULL)
6805 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006806 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02006807 else {
6808 assert(PyUnicode_Check(rep));
Victor Stinner0030cd52015-09-24 14:45:00 +02006809
Victor Stinner6bd525b2015-10-09 13:10:05 +02006810 if (PyUnicode_READY(rep) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006811 goto onError;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006812
6813 if (PyUnicode_IS_ASCII(rep)) {
6814 /* Fast path: all characters are smaller than limit */
6815 assert(limit >= 128);
6816 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6817 str = _PyBytesWriter_WriteBytes(&writer, str,
6818 PyUnicode_DATA(rep),
6819 PyUnicode_GET_LENGTH(rep));
Benjamin Peterson29060642009-01-31 22:14:21 +00006820 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02006821 else {
6822 Py_ssize_t repsize = PyUnicode_GET_LENGTH(rep);
6823
6824 str = _PyBytesWriter_Prepare(&writer, str, repsize);
6825 if (str == NULL)
6826 goto onError;
6827
6828 /* check if there is anything unencodable in the
6829 replacement and copy it to the output */
6830 for (i = 0; repsize-->0; ++i, ++str) {
6831 ch = PyUnicode_READ_CHAR(rep, i);
6832 if (ch >= limit) {
6833 raise_encode_exception(&exc, encoding, unicode,
6834 pos, pos+1, reason);
6835 goto onError;
6836 }
6837 *str = (char)ch;
6838 }
6839 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006840 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006841 pos = newpos;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006842 Py_CLEAR(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006843 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006844
6845 /* If overallocation was disabled, ensure that it was the last
6846 write. Otherwise, we missed an optimization */
6847 assert(writer.overallocate || pos == size);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006848 }
6849 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006850
Victor Stinner50149202015-09-22 00:26:54 +02006851 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006852 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006853 return _PyBytesWriter_Finish(&writer, str);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006854
6855 onError:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006856 Py_XDECREF(rep);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006857 _PyBytesWriter_Dealloc(&writer);
Victor Stinner50149202015-09-22 00:26:54 +02006858 Py_XDECREF(error_handler_obj);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006859 Py_XDECREF(exc);
6860 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006861}
6862
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006863/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006864PyObject *
6865PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006866 Py_ssize_t size,
6867 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006868{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006869 PyObject *result;
6870 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6871 if (unicode == NULL)
6872 return NULL;
6873 result = unicode_encode_ucs1(unicode, errors, 256);
6874 Py_DECREF(unicode);
6875 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006876}
6877
Alexander Belopolsky40018472011-02-26 01:02:56 +00006878PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006879_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006880{
6881 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006882 PyErr_BadArgument();
6883 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006884 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006885 if (PyUnicode_READY(unicode) == -1)
6886 return NULL;
6887 /* Fast path: if it is a one-byte string, construct
6888 bytes object directly. */
6889 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6890 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6891 PyUnicode_GET_LENGTH(unicode));
6892 /* Non-Latin-1 characters present. Defer to above function to
6893 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006894 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006895}
6896
6897PyObject*
6898PyUnicode_AsLatin1String(PyObject *unicode)
6899{
6900 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006901}
6902
6903/* --- 7-bit ASCII Codec -------------------------------------------------- */
6904
Alexander Belopolsky40018472011-02-26 01:02:56 +00006905PyObject *
6906PyUnicode_DecodeASCII(const char *s,
6907 Py_ssize_t size,
6908 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006909{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006910 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006911 _PyUnicodeWriter writer;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006912 int kind;
6913 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006914 Py_ssize_t startinpos;
6915 Py_ssize_t endinpos;
6916 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006917 const char *e;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006918 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006919 PyObject *exc = NULL;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006920 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Tim Petersced69f82003-09-16 20:30:58 +00006921
Guido van Rossumd57fd912000-03-10 22:53:23 +00006922 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02006923 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006924
Guido van Rossumd57fd912000-03-10 22:53:23 +00006925 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006926 if (size == 1 && (unsigned char)s[0] < 128)
6927 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006928
Victor Stinner8f674cc2013-04-17 23:02:17 +02006929 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02006930 writer.min_length = size;
6931 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
Victor Stinner8f674cc2013-04-17 23:02:17 +02006932 return NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006933
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006934 e = s + size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006935 data = writer.data;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006936 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006937 writer.pos = outpos;
6938 if (writer.pos == size)
6939 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006940
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006941 s += writer.pos;
6942 kind = writer.kind;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006943 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02006944 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00006945 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006946 PyUnicode_WRITE(kind, data, writer.pos, c);
6947 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00006948 ++s;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006949 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +00006950 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02006951
6952 /* byte outsize range 0x00..0x7f: call the error handler */
6953
6954 if (error_handler == _Py_ERROR_UNKNOWN)
6955 error_handler = get_error_handler(errors);
6956
6957 switch (error_handler)
6958 {
6959 case _Py_ERROR_REPLACE:
6960 case _Py_ERROR_SURROGATEESCAPE:
6961 /* Fast-path: the error handler only writes one character,
Victor Stinnerca9381e2015-09-22 00:58:32 +02006962 but we may switch to UCS2 at the first write */
6963 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
6964 goto onError;
6965 kind = writer.kind;
6966 data = writer.data;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006967
6968 if (error_handler == _Py_ERROR_REPLACE)
6969 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
6970 else
6971 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
6972 writer.pos++;
6973 ++s;
6974 break;
6975
6976 case _Py_ERROR_IGNORE:
6977 ++s;
6978 break;
6979
6980 default:
Benjamin Peterson29060642009-01-31 22:14:21 +00006981 startinpos = s-starts;
6982 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006983 if (unicode_decode_call_errorhandler_writer(
Victor Stinnerf96418d2015-09-21 23:06:27 +02006984 errors, &error_handler_obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00006985 "ascii", "ordinal not in range(128)",
6986 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006987 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006988 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006989 kind = writer.kind;
6990 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00006991 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006992 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02006993 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006994 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006995 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006996
Benjamin Peterson29060642009-01-31 22:14:21 +00006997 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006998 _PyUnicodeWriter_Dealloc(&writer);
Victor Stinnerf96418d2015-09-21 23:06:27 +02006999 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007000 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007001 return NULL;
7002}
7003
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007004/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007005PyObject *
7006PyUnicode_EncodeASCII(const Py_UNICODE *p,
7007 Py_ssize_t size,
7008 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007009{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007010 PyObject *result;
7011 PyObject *unicode = PyUnicode_FromUnicode(p, size);
7012 if (unicode == NULL)
7013 return NULL;
7014 result = unicode_encode_ucs1(unicode, errors, 128);
7015 Py_DECREF(unicode);
7016 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007017}
7018
Alexander Belopolsky40018472011-02-26 01:02:56 +00007019PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007020_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007021{
7022 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007023 PyErr_BadArgument();
7024 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007025 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007026 if (PyUnicode_READY(unicode) == -1)
7027 return NULL;
7028 /* Fast path: if it is an ASCII-only string, construct bytes object
7029 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02007030 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007031 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7032 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007033 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007034}
7035
7036PyObject *
7037PyUnicode_AsASCIIString(PyObject *unicode)
7038{
7039 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007040}
7041
Steve Dowercc16be82016-09-08 10:35:16 -07007042#ifdef MS_WINDOWS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007043
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007044/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007045
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00007046#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007047#define NEED_RETRY
7048#endif
7049
Victor Stinner3a50e702011-10-18 21:21:00 +02007050#ifndef WC_ERR_INVALID_CHARS
7051# define WC_ERR_INVALID_CHARS 0x0080
7052#endif
7053
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007054static const char*
Victor Stinner3a50e702011-10-18 21:21:00 +02007055code_page_name(UINT code_page, PyObject **obj)
7056{
7057 *obj = NULL;
7058 if (code_page == CP_ACP)
7059 return "mbcs";
7060 if (code_page == CP_UTF7)
7061 return "CP_UTF7";
7062 if (code_page == CP_UTF8)
7063 return "CP_UTF8";
7064
7065 *obj = PyBytes_FromFormat("cp%u", code_page);
7066 if (*obj == NULL)
7067 return NULL;
7068 return PyBytes_AS_STRING(*obj);
7069}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007070
Victor Stinner3a50e702011-10-18 21:21:00 +02007071static DWORD
7072decode_code_page_flags(UINT code_page)
7073{
7074 if (code_page == CP_UTF7) {
7075 /* The CP_UTF7 decoder only supports flags=0 */
7076 return 0;
7077 }
7078 else
7079 return MB_ERR_INVALID_CHARS;
7080}
7081
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007082/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007083 * Decode a byte string from a Windows code page into unicode object in strict
7084 * mode.
7085 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007086 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7087 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007088 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007089static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007090decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007091 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02007092 const char *in,
7093 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007094{
Victor Stinner3a50e702011-10-18 21:21:00 +02007095 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01007096 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007097 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007098
7099 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007100 assert(insize > 0);
7101 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
7102 if (outsize <= 0)
7103 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007104
7105 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007106 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01007107 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007108 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00007109 if (*v == NULL)
7110 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007111 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007112 }
7113 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007114 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007115 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01007116 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007117 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007118 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007119 }
7120
7121 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007122 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7123 if (outsize <= 0)
7124 goto error;
7125 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007126
Victor Stinner3a50e702011-10-18 21:21:00 +02007127error:
7128 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7129 return -2;
7130 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007131 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007132}
7133
Victor Stinner3a50e702011-10-18 21:21:00 +02007134/*
7135 * Decode a byte string from a code page into unicode object with an error
7136 * handler.
7137 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007138 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02007139 * UnicodeDecodeError exception and returns -1 on error.
7140 */
7141static int
7142decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007143 PyObject **v,
7144 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007145 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02007146{
7147 const char *startin = in;
7148 const char *endin = in + size;
7149 const DWORD flags = decode_code_page_flags(code_page);
7150 /* Ideally, we should get reason from FormatMessage. This is the Windows
7151 2000 English version of the message. */
7152 const char *reason = "No mapping for the Unicode character exists "
7153 "in the target code page.";
7154 /* each step cannot decode more than 1 character, but a character can be
7155 represented as a surrogate pair */
7156 wchar_t buffer[2], *startout, *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02007157 int insize;
7158 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007159 PyObject *errorHandler = NULL;
7160 PyObject *exc = NULL;
7161 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007162 const char *encoding;
Victor Stinner3a50e702011-10-18 21:21:00 +02007163 DWORD err;
7164 int ret = -1;
7165
7166 assert(size > 0);
7167
7168 encoding = code_page_name(code_page, &encoding_obj);
7169 if (encoding == NULL)
7170 return -1;
7171
Victor Stinner7d00cc12014-03-17 23:08:06 +01007172 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007173 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7174 UnicodeDecodeError. */
7175 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7176 if (exc != NULL) {
7177 PyCodec_StrictErrors(exc);
7178 Py_CLEAR(exc);
7179 }
7180 goto error;
7181 }
7182
7183 if (*v == NULL) {
7184 /* Create unicode object */
7185 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7186 PyErr_NoMemory();
7187 goto error;
7188 }
Victor Stinnerab595942011-12-17 04:59:06 +01007189 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007190 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02007191 if (*v == NULL)
7192 goto error;
7193 startout = PyUnicode_AS_UNICODE(*v);
7194 }
7195 else {
7196 /* Extend unicode object */
7197 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7198 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7199 PyErr_NoMemory();
7200 goto error;
7201 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007202 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007203 goto error;
7204 startout = PyUnicode_AS_UNICODE(*v) + n;
7205 }
7206
7207 /* Decode the byte string character per character */
7208 out = startout;
7209 while (in < endin)
7210 {
7211 /* Decode a character */
7212 insize = 1;
7213 do
7214 {
7215 outsize = MultiByteToWideChar(code_page, flags,
7216 in, insize,
7217 buffer, Py_ARRAY_LENGTH(buffer));
7218 if (outsize > 0)
7219 break;
7220 err = GetLastError();
7221 if (err != ERROR_NO_UNICODE_TRANSLATION
7222 && err != ERROR_INSUFFICIENT_BUFFER)
7223 {
7224 PyErr_SetFromWindowsErr(0);
7225 goto error;
7226 }
7227 insize++;
7228 }
7229 /* 4=maximum length of a UTF-8 sequence */
7230 while (insize <= 4 && (in + insize) <= endin);
7231
7232 if (outsize <= 0) {
7233 Py_ssize_t startinpos, endinpos, outpos;
7234
Victor Stinner7d00cc12014-03-17 23:08:06 +01007235 /* last character in partial decode? */
7236 if (in + insize >= endin && !final)
7237 break;
7238
Victor Stinner3a50e702011-10-18 21:21:00 +02007239 startinpos = in - startin;
7240 endinpos = startinpos + 1;
7241 outpos = out - PyUnicode_AS_UNICODE(*v);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007242 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007243 errors, &errorHandler,
7244 encoding, reason,
7245 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01007246 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007247 {
7248 goto error;
7249 }
Victor Stinner596a6c42011-11-09 00:02:18 +01007250 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007251 }
7252 else {
7253 in += insize;
7254 memcpy(out, buffer, outsize * sizeof(wchar_t));
7255 out += outsize;
7256 }
7257 }
7258
7259 /* write a NUL character at the end */
7260 *out = 0;
7261
7262 /* Extend unicode object */
7263 outsize = out - startout;
7264 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01007265 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007266 goto error;
Victor Stinnere1f17c62014-07-25 14:03:03 +02007267 /* (in - startin) <= size and size is an int */
7268 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02007269
7270error:
7271 Py_XDECREF(encoding_obj);
7272 Py_XDECREF(errorHandler);
7273 Py_XDECREF(exc);
7274 return ret;
7275}
7276
Victor Stinner3a50e702011-10-18 21:21:00 +02007277static PyObject *
7278decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007279 const char *s, Py_ssize_t size,
7280 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007281{
Victor Stinner76a31a62011-11-04 00:05:13 +01007282 PyObject *v = NULL;
7283 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007284
Victor Stinner3a50e702011-10-18 21:21:00 +02007285 if (code_page < 0) {
7286 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7287 return NULL;
7288 }
7289
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007290 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007291 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007292
Victor Stinner76a31a62011-11-04 00:05:13 +01007293 do
7294 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007295#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007296 if (size > INT_MAX) {
7297 chunk_size = INT_MAX;
7298 final = 0;
7299 done = 0;
7300 }
7301 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007302#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007303 {
7304 chunk_size = (int)size;
7305 final = (consumed == NULL);
7306 done = 1;
7307 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007308
Victor Stinner76a31a62011-11-04 00:05:13 +01007309 if (chunk_size == 0 && done) {
7310 if (v != NULL)
7311 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007312 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007313 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007314
Victor Stinner76a31a62011-11-04 00:05:13 +01007315 converted = decode_code_page_strict(code_page, &v,
7316 s, chunk_size);
7317 if (converted == -2)
7318 converted = decode_code_page_errors(code_page, &v,
7319 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007320 errors, final);
7321 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007322
7323 if (converted < 0) {
7324 Py_XDECREF(v);
7325 return NULL;
7326 }
7327
7328 if (consumed)
7329 *consumed += converted;
7330
7331 s += converted;
7332 size -= converted;
7333 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007334
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007335 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007336}
7337
Alexander Belopolsky40018472011-02-26 01:02:56 +00007338PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007339PyUnicode_DecodeCodePageStateful(int code_page,
7340 const char *s,
7341 Py_ssize_t size,
7342 const char *errors,
7343 Py_ssize_t *consumed)
7344{
7345 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7346}
7347
7348PyObject *
7349PyUnicode_DecodeMBCSStateful(const char *s,
7350 Py_ssize_t size,
7351 const char *errors,
7352 Py_ssize_t *consumed)
7353{
7354 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7355}
7356
7357PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007358PyUnicode_DecodeMBCS(const char *s,
7359 Py_ssize_t size,
7360 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007361{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007362 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7363}
7364
Victor Stinner3a50e702011-10-18 21:21:00 +02007365static DWORD
7366encode_code_page_flags(UINT code_page, const char *errors)
7367{
7368 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007369 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007370 }
7371 else if (code_page == CP_UTF7) {
7372 /* CP_UTF7 only supports flags=0 */
7373 return 0;
7374 }
7375 else {
7376 if (errors != NULL && strcmp(errors, "replace") == 0)
7377 return 0;
7378 else
7379 return WC_NO_BEST_FIT_CHARS;
7380 }
7381}
7382
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007383/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007384 * Encode a Unicode string to a Windows code page into a byte string in strict
7385 * mode.
7386 *
7387 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007388 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007389 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007390static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007391encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007392 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007393 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007394{
Victor Stinner554f3f02010-06-16 23:33:54 +00007395 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007396 BOOL *pusedDefaultChar = &usedDefaultChar;
7397 int outsize;
Victor Stinner24729f32011-11-10 20:31:37 +01007398 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007399 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007400 const DWORD flags = encode_code_page_flags(code_page, NULL);
7401 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007402 /* Create a substring so that we can get the UTF-16 representation
7403 of just the slice under consideration. */
7404 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007405
Martin v. Löwis3d325192011-11-04 18:23:06 +01007406 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007407
Victor Stinner3a50e702011-10-18 21:21:00 +02007408 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007409 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007410 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007411 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007412
Victor Stinner2fc507f2011-11-04 20:06:39 +01007413 substring = PyUnicode_Substring(unicode, offset, offset+len);
7414 if (substring == NULL)
7415 return -1;
7416 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7417 if (p == NULL) {
7418 Py_DECREF(substring);
7419 return -1;
7420 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007421 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007422
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007423 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007424 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007425 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007426 NULL, 0,
7427 NULL, pusedDefaultChar);
7428 if (outsize <= 0)
7429 goto error;
7430 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007431 if (pusedDefaultChar && *pusedDefaultChar) {
7432 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007433 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007434 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007435
Victor Stinner3a50e702011-10-18 21:21:00 +02007436 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007437 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007438 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007439 if (*outbytes == NULL) {
7440 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007441 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007442 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007443 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007444 }
7445 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007446 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007447 const Py_ssize_t n = PyBytes_Size(*outbytes);
7448 if (outsize > PY_SSIZE_T_MAX - n) {
7449 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007450 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007451 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007452 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007453 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7454 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007455 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007456 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007457 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007458 }
7459
7460 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007461 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007462 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007463 out, outsize,
7464 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007465 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007466 if (outsize <= 0)
7467 goto error;
7468 if (pusedDefaultChar && *pusedDefaultChar)
7469 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007470 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007471
Victor Stinner3a50e702011-10-18 21:21:00 +02007472error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007473 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007474 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7475 return -2;
7476 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007477 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007478}
7479
Victor Stinner3a50e702011-10-18 21:21:00 +02007480/*
Serhiy Storchakad65c9492015-11-02 14:10:23 +02007481 * Encode a Unicode string to a Windows code page into a byte string using an
Victor Stinner3a50e702011-10-18 21:21:00 +02007482 * error handler.
7483 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007484 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007485 * -1 on other error.
7486 */
7487static int
7488encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007489 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007490 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007491{
Victor Stinner3a50e702011-10-18 21:21:00 +02007492 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007493 Py_ssize_t pos = unicode_offset;
7494 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007495 /* Ideally, we should get reason from FormatMessage. This is the Windows
7496 2000 English version of the message. */
7497 const char *reason = "invalid character";
7498 /* 4=maximum length of a UTF-8 sequence */
7499 char buffer[4];
7500 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7501 Py_ssize_t outsize;
7502 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007503 PyObject *errorHandler = NULL;
7504 PyObject *exc = NULL;
7505 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007506 const char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007507 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007508 PyObject *rep;
7509 int ret = -1;
7510
7511 assert(insize > 0);
7512
7513 encoding = code_page_name(code_page, &encoding_obj);
7514 if (encoding == NULL)
7515 return -1;
7516
7517 if (errors == NULL || strcmp(errors, "strict") == 0) {
7518 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7519 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007520 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007521 if (exc != NULL) {
7522 PyCodec_StrictErrors(exc);
7523 Py_DECREF(exc);
7524 }
7525 Py_XDECREF(encoding_obj);
7526 return -1;
7527 }
7528
7529 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7530 pusedDefaultChar = &usedDefaultChar;
7531 else
7532 pusedDefaultChar = NULL;
7533
7534 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7535 PyErr_NoMemory();
7536 goto error;
7537 }
7538 outsize = insize * Py_ARRAY_LENGTH(buffer);
7539
7540 if (*outbytes == NULL) {
7541 /* Create string object */
7542 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7543 if (*outbytes == NULL)
7544 goto error;
7545 out = PyBytes_AS_STRING(*outbytes);
7546 }
7547 else {
7548 /* Extend string object */
7549 Py_ssize_t n = PyBytes_Size(*outbytes);
7550 if (n > PY_SSIZE_T_MAX - outsize) {
7551 PyErr_NoMemory();
7552 goto error;
7553 }
7554 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7555 goto error;
7556 out = PyBytes_AS_STRING(*outbytes) + n;
7557 }
7558
7559 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007560 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007561 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007562 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7563 wchar_t chars[2];
7564 int charsize;
7565 if (ch < 0x10000) {
7566 chars[0] = (wchar_t)ch;
7567 charsize = 1;
7568 }
7569 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007570 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7571 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007572 charsize = 2;
7573 }
7574
Victor Stinner3a50e702011-10-18 21:21:00 +02007575 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007576 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007577 buffer, Py_ARRAY_LENGTH(buffer),
7578 NULL, pusedDefaultChar);
7579 if (outsize > 0) {
7580 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7581 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007582 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007583 memcpy(out, buffer, outsize);
7584 out += outsize;
7585 continue;
7586 }
7587 }
7588 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7589 PyErr_SetFromWindowsErr(0);
7590 goto error;
7591 }
7592
Victor Stinner3a50e702011-10-18 21:21:00 +02007593 rep = unicode_encode_call_errorhandler(
7594 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007595 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007596 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007597 if (rep == NULL)
7598 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007599 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007600
7601 if (PyBytes_Check(rep)) {
7602 outsize = PyBytes_GET_SIZE(rep);
7603 if (outsize != 1) {
7604 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7605 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7606 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7607 Py_DECREF(rep);
7608 goto error;
7609 }
7610 out = PyBytes_AS_STRING(*outbytes) + offset;
7611 }
7612 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7613 out += outsize;
7614 }
7615 else {
7616 Py_ssize_t i;
7617 enum PyUnicode_Kind kind;
7618 void *data;
7619
Benjamin Petersonbac79492012-01-14 13:34:47 -05007620 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007621 Py_DECREF(rep);
7622 goto error;
7623 }
7624
7625 outsize = PyUnicode_GET_LENGTH(rep);
7626 if (outsize != 1) {
7627 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7628 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7629 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7630 Py_DECREF(rep);
7631 goto error;
7632 }
7633 out = PyBytes_AS_STRING(*outbytes) + offset;
7634 }
7635 kind = PyUnicode_KIND(rep);
7636 data = PyUnicode_DATA(rep);
7637 for (i=0; i < outsize; i++) {
7638 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7639 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007640 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007641 encoding, unicode,
7642 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007643 "unable to encode error handler result to ASCII");
7644 Py_DECREF(rep);
7645 goto error;
7646 }
7647 *out = (unsigned char)ch;
7648 out++;
7649 }
7650 }
7651 Py_DECREF(rep);
7652 }
7653 /* write a NUL byte */
7654 *out = 0;
7655 outsize = out - PyBytes_AS_STRING(*outbytes);
7656 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7657 if (_PyBytes_Resize(outbytes, outsize) < 0)
7658 goto error;
7659 ret = 0;
7660
7661error:
7662 Py_XDECREF(encoding_obj);
7663 Py_XDECREF(errorHandler);
7664 Py_XDECREF(exc);
7665 return ret;
7666}
7667
Victor Stinner3a50e702011-10-18 21:21:00 +02007668static PyObject *
7669encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007670 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007671 const char *errors)
7672{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007673 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007674 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007675 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007676 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007677
Victor Stinner29dacf22015-01-26 16:41:32 +01007678 if (!PyUnicode_Check(unicode)) {
7679 PyErr_BadArgument();
7680 return NULL;
7681 }
7682
Benjamin Petersonbac79492012-01-14 13:34:47 -05007683 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007684 return NULL;
7685 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007686
Victor Stinner3a50e702011-10-18 21:21:00 +02007687 if (code_page < 0) {
7688 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7689 return NULL;
7690 }
7691
Martin v. Löwis3d325192011-11-04 18:23:06 +01007692 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007693 return PyBytes_FromStringAndSize(NULL, 0);
7694
Victor Stinner7581cef2011-11-03 22:32:33 +01007695 offset = 0;
7696 do
7697 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007698#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007699 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007700 chunks. */
7701 if (len > INT_MAX/2) {
7702 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007703 done = 0;
7704 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007705 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007706#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007707 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007708 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007709 done = 1;
7710 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007711
Victor Stinner76a31a62011-11-04 00:05:13 +01007712 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007713 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007714 errors);
7715 if (ret == -2)
7716 ret = encode_code_page_errors(code_page, &outbytes,
7717 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007718 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007719 if (ret < 0) {
7720 Py_XDECREF(outbytes);
7721 return NULL;
7722 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007723
Victor Stinner7581cef2011-11-03 22:32:33 +01007724 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007725 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007726 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007727
Victor Stinner3a50e702011-10-18 21:21:00 +02007728 return outbytes;
7729}
7730
7731PyObject *
7732PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7733 Py_ssize_t size,
7734 const char *errors)
7735{
Victor Stinner7581cef2011-11-03 22:32:33 +01007736 PyObject *unicode, *res;
7737 unicode = PyUnicode_FromUnicode(p, size);
7738 if (unicode == NULL)
7739 return NULL;
7740 res = encode_code_page(CP_ACP, unicode, errors);
7741 Py_DECREF(unicode);
7742 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007743}
7744
7745PyObject *
7746PyUnicode_EncodeCodePage(int code_page,
7747 PyObject *unicode,
7748 const char *errors)
7749{
Victor Stinner7581cef2011-11-03 22:32:33 +01007750 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007751}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007752
Alexander Belopolsky40018472011-02-26 01:02:56 +00007753PyObject *
7754PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007755{
Victor Stinner7581cef2011-11-03 22:32:33 +01007756 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007757}
7758
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007759#undef NEED_RETRY
7760
Steve Dowercc16be82016-09-08 10:35:16 -07007761#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007762
Guido van Rossumd57fd912000-03-10 22:53:23 +00007763/* --- Character Mapping Codec -------------------------------------------- */
7764
/* Decode the `size` bytes starting at `s` using the str object `mapping`
   as a lookup table: each input byte is used as an index into `mapping`
   and the character found there is appended to `writer`.

   A byte whose index is >= len(mapping), or whose table entry is U+FFFE,
   is treated as undefined and handed to the error handler selected by
   `errors`.

   Returns 0 on success and -1 on failure. */
static int
charmap_decode_string(const char *s,
                      Py_ssize_t size,
                      PyObject *mapping,
                      const char *errors,
                      _PyUnicodeWriter *writer)
{
    const char *starts = s;
    const char *e;
    Py_ssize_t startinpos, endinpos;
    PyObject *errorHandler = NULL, *exc = NULL;
    Py_ssize_t maplen;
    enum PyUnicode_Kind mapkind;
    void *mapdata;
    Py_UCS4 x;
    unsigned char ch;

    if (PyUnicode_READY(mapping) == -1)
        return -1;

    maplen = PyUnicode_GET_LENGTH(mapping);
    mapdata = PyUnicode_DATA(mapping);
    mapkind = PyUnicode_KIND(mapping);

    e = s + size;

    if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
        /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
         * is disabled in encoding aliases, latin1 is preferred because
         * its implementation is faster. */
        Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
        Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
        Py_UCS4 maxchar = writer->maxchar;

        assert (writer->kind == PyUnicode_1BYTE_KIND);
        while (s < e) {
            ch = *s;
            x = mapdata_ucs1[ch];
            if (x > maxchar) {
                /* raise the writer's maxchar to 0xff, then reload the
                   cached maxchar and data pointer from the writer */
                if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
                    goto onError;
                maxchar = writer->maxchar;
                outdata = (Py_UCS1 *)writer->data;
            }
            outdata[writer->pos] = x;
            writer->pos++;
            ++s;
        }
        /* errorHandler and exc are still NULL on this path */
        return 0;
    }

    while (s < e) {
        if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
            /* Tight inner loops for a 2-byte table with at least 256
               entries.  The writer kind is re-read on every pass of the
               outer loop because the slow path below may change it. */
            enum PyUnicode_Kind outkind = writer->kind;
            Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
            if (outkind == PyUnicode_1BYTE_KIND) {
                Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
                Py_UCS4 maxchar = writer->maxchar;
                while (s < e) {
                    ch = *s;
                    x = mapdata_ucs2[ch];
                    /* x is above the writer's current maxchar: leave the
                       inner loop and let the code at Error handle it */
                    if (x > maxchar)
                        goto Error;
                    outdata[writer->pos] = x;
                    writer->pos++;
                    ++s;
                }
                break;
            }
            else if (outkind == PyUnicode_2BYTE_KIND) {
                Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
                while (s < e) {
                    ch = *s;
                    x = mapdata_ucs2[ch];
                    /* U+FFFE marks an undefined entry */
                    if (x == 0xFFFE)
                        goto Error;
                    outdata[writer->pos] = x;
                    writer->pos++;
                    ++s;
                }
                break;
            }
        }
        /* generic path: one byte per iteration */
        ch = *s;

        if (ch < maplen)
            x = PyUnicode_READ(mapkind, mapdata, ch);
        else
            x = 0xfffe; /* invalid value */
Error:
        /* also reached by goto from the inner loops above, with s and x
           describing the byte that could not be stored directly */
        if (x == 0xfffe)
        {
            /* undefined mapping */
            startinpos = s-starts;
            endinpos = startinpos+1;
            if (unicode_decode_call_errorhandler_writer(
                    errors, &errorHandler,
                    "charmap", "character maps to <undefined>",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    writer)) {
                goto onError;
            }
            /* s is not advanced here; its address was passed to the
               handler call above */
            continue;
        }

        if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
            goto onError;
        ++s;
    }
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return 0;

onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return -1;
}
7883
7884static int
7885charmap_decode_mapping(const char *s,
7886 Py_ssize_t size,
7887 PyObject *mapping,
7888 const char *errors,
7889 _PyUnicodeWriter *writer)
7890{
7891 const char *starts = s;
7892 const char *e;
7893 Py_ssize_t startinpos, endinpos;
7894 PyObject *errorHandler = NULL, *exc = NULL;
7895 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02007896 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02007897
7898 e = s + size;
7899
7900 while (s < e) {
7901 ch = *s;
7902
7903 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7904 key = PyLong_FromLong((long)ch);
7905 if (key == NULL)
7906 goto onError;
7907
7908 item = PyObject_GetItem(mapping, key);
7909 Py_DECREF(key);
7910 if (item == NULL) {
7911 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7912 /* No mapping found means: mapping is undefined. */
7913 PyErr_Clear();
7914 goto Undefined;
7915 } else
7916 goto onError;
7917 }
7918
7919 /* Apply mapping */
7920 if (item == Py_None)
7921 goto Undefined;
7922 if (PyLong_Check(item)) {
7923 long value = PyLong_AS_LONG(item);
7924 if (value == 0xFFFE)
7925 goto Undefined;
7926 if (value < 0 || value > MAX_UNICODE) {
7927 PyErr_Format(PyExc_TypeError,
7928 "character mapping must be in range(0x%lx)",
7929 (unsigned long)MAX_UNICODE + 1);
7930 goto onError;
7931 }
7932
7933 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7934 goto onError;
7935 }
7936 else if (PyUnicode_Check(item)) {
7937 if (PyUnicode_READY(item) == -1)
7938 goto onError;
7939 if (PyUnicode_GET_LENGTH(item) == 1) {
7940 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
7941 if (value == 0xFFFE)
7942 goto Undefined;
7943 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7944 goto onError;
7945 }
7946 else {
7947 writer->overallocate = 1;
7948 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
7949 goto onError;
7950 }
7951 }
7952 else {
7953 /* wrong return value */
7954 PyErr_SetString(PyExc_TypeError,
7955 "character mapping must return integer, None or str");
7956 goto onError;
7957 }
7958 Py_CLEAR(item);
7959 ++s;
7960 continue;
7961
7962Undefined:
7963 /* undefined mapping */
7964 Py_CLEAR(item);
7965 startinpos = s-starts;
7966 endinpos = startinpos+1;
7967 if (unicode_decode_call_errorhandler_writer(
7968 errors, &errorHandler,
7969 "charmap", "character maps to <undefined>",
7970 &starts, &e, &startinpos, &endinpos, &exc, &s,
7971 writer)) {
7972 goto onError;
7973 }
7974 }
7975 Py_XDECREF(errorHandler);
7976 Py_XDECREF(exc);
7977 return 0;
7978
7979onError:
7980 Py_XDECREF(item);
7981 Py_XDECREF(errorHandler);
7982 Py_XDECREF(exc);
7983 return -1;
7984}
7985
Alexander Belopolsky40018472011-02-26 01:02:56 +00007986PyObject *
7987PyUnicode_DecodeCharmap(const char *s,
7988 Py_ssize_t size,
7989 PyObject *mapping,
7990 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007991{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007992 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00007993
Guido van Rossumd57fd912000-03-10 22:53:23 +00007994 /* Default to Latin-1 */
7995 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007996 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007997
Guido van Rossumd57fd912000-03-10 22:53:23 +00007998 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02007999 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02008000 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02008001 writer.min_length = size;
8002 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008003 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008004
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008005 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008006 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8007 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008008 }
8009 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008010 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8011 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008012 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008013 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00008014
Benjamin Peterson29060642009-01-31 22:14:21 +00008015 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008016 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008017 return NULL;
8018}
8019
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008020/* Charmap encoding: the lookup table */
8021
Alexander Belopolsky40018472011-02-26 01:02:56 +00008022struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00008023 PyObject_HEAD
8024 unsigned char level1[32];
8025 int count2, count3;
8026 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008027};
8028
8029static PyObject*
8030encoding_map_size(PyObject *obj, PyObject* args)
8031{
8032 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008033 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00008034 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008035}
8036
8037static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008038 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00008039 PyDoc_STR("Return the size (in bytes) of this object") },
8040 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008041};
8042
/* tp_dealloc slot of EncodingMapType.  Only the object's own memory
   block is released with PyObject_FREE(); the visible struct fields
   are plain arrays and ints, and no other cleanup is done here. */
static void
encoding_map_dealloc(PyObject* o)
{
    PyObject_FREE(o);
}
8048
/* Type object for the "EncodingMap" lookup tables created by
   PyUnicode_BuildEncodingMap().  Only tp_dealloc and tp_methods are
   filled in; tp_itemsize is 0 even though instances are allocated with
   extra trailing bytes, and tp_new is 0. */
static PyTypeObject EncodingMapType = {
    PyVarObject_HEAD_INIT(NULL, 0)
    "EncodingMap",          /*tp_name*/
    sizeof(struct encoding_map),   /*tp_basicsize*/
    0,                      /*tp_itemsize*/
    /* methods */
    encoding_map_dealloc,   /*tp_dealloc*/
    0,                      /*tp_print*/
    0,                      /*tp_getattr*/
    0,                      /*tp_setattr*/
    0,                      /*tp_reserved*/
    0,                      /*tp_repr*/
    0,                      /*tp_as_number*/
    0,                      /*tp_as_sequence*/
    0,                      /*tp_as_mapping*/
    0,                      /*tp_hash*/
    0,                      /*tp_call*/
    0,                      /*tp_str*/
    0,                      /*tp_getattro*/
    0,                      /*tp_setattro*/
    0,                      /*tp_as_buffer*/
    Py_TPFLAGS_DEFAULT,     /*tp_flags*/
    0,                      /*tp_doc*/
    0,                      /*tp_traverse*/
    0,                      /*tp_clear*/
    0,                      /*tp_richcompare*/
    0,                      /*tp_weaklistoffset*/
    0,                      /*tp_iter*/
    0,                      /*tp_iternext*/
    encoding_map_methods,   /*tp_methods*/
    0,                      /*tp_members*/
    0,                      /*tp_getset*/
    0,                      /*tp_base*/
    0,                      /*tp_dict*/
    0,                      /*tp_descr_get*/
    0,                      /*tp_descr_set*/
    0,                      /*tp_dictoffset*/
    0,                      /*tp_init*/
    0,                      /*tp_alloc*/
    0,                      /*tp_new*/
    0,                      /*tp_free*/
    0,                      /*tp_is_gc*/
};
8092
8093PyObject*
8094PyUnicode_BuildEncodingMap(PyObject* string)
8095{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008096 PyObject *result;
8097 struct encoding_map *mresult;
8098 int i;
8099 int need_dict = 0;
8100 unsigned char level1[32];
8101 unsigned char level2[512];
8102 unsigned char *mlevel1, *mlevel2, *mlevel3;
8103 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008104 int kind;
8105 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008106 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008107 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008108
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008109 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008110 PyErr_BadArgument();
8111 return NULL;
8112 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008113 kind = PyUnicode_KIND(string);
8114 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008115 length = PyUnicode_GET_LENGTH(string);
8116 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008117 memset(level1, 0xFF, sizeof level1);
8118 memset(level2, 0xFF, sizeof level2);
8119
8120 /* If there isn't a one-to-one mapping of NULL to \0,
8121 or if there are non-BMP characters, we need to use
8122 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008123 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008124 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008125 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008126 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008127 ch = PyUnicode_READ(kind, data, i);
8128 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008129 need_dict = 1;
8130 break;
8131 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008132 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008133 /* unmapped character */
8134 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008135 l1 = ch >> 11;
8136 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008137 if (level1[l1] == 0xFF)
8138 level1[l1] = count2++;
8139 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008140 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008141 }
8142
8143 if (count2 >= 0xFF || count3 >= 0xFF)
8144 need_dict = 1;
8145
8146 if (need_dict) {
8147 PyObject *result = PyDict_New();
8148 PyObject *key, *value;
8149 if (!result)
8150 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008151 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008152 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008153 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008154 if (!key || !value)
8155 goto failed1;
8156 if (PyDict_SetItem(result, key, value) == -1)
8157 goto failed1;
8158 Py_DECREF(key);
8159 Py_DECREF(value);
8160 }
8161 return result;
8162 failed1:
8163 Py_XDECREF(key);
8164 Py_XDECREF(value);
8165 Py_DECREF(result);
8166 return NULL;
8167 }
8168
8169 /* Create a three-level trie */
8170 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8171 16*count2 + 128*count3 - 1);
8172 if (!result)
8173 return PyErr_NoMemory();
8174 PyObject_Init(result, &EncodingMapType);
8175 mresult = (struct encoding_map*)result;
8176 mresult->count2 = count2;
8177 mresult->count3 = count3;
8178 mlevel1 = mresult->level1;
8179 mlevel2 = mresult->level23;
8180 mlevel3 = mresult->level23 + 16*count2;
8181 memcpy(mlevel1, level1, 32);
8182 memset(mlevel2, 0xFF, 16*count2);
8183 memset(mlevel3, 0, 128*count3);
8184 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008185 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008186 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008187 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8188 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008189 /* unmapped character */
8190 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008191 o1 = ch>>11;
8192 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008193 i2 = 16*mlevel1[o1] + o2;
8194 if (mlevel2[i2] == 0xFF)
8195 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008196 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008197 i3 = 128*mlevel2[i2] + o3;
8198 mlevel3[i3] = i;
8199 }
8200 return result;
8201}
8202
8203static int
Victor Stinner22168992011-11-20 17:09:18 +01008204encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008205{
8206 struct encoding_map *map = (struct encoding_map*)mapping;
8207 int l1 = c>>11;
8208 int l2 = (c>>7) & 0xF;
8209 int l3 = c & 0x7F;
8210 int i;
8211
Victor Stinner22168992011-11-20 17:09:18 +01008212 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008213 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008214 if (c == 0)
8215 return 0;
8216 /* level 1*/
8217 i = map->level1[l1];
8218 if (i == 0xFF) {
8219 return -1;
8220 }
8221 /* level 2*/
8222 i = map->level23[16*i+l2];
8223 if (i == 0xFF) {
8224 return -1;
8225 }
8226 /* level 3 */
8227 i = map->level23[16*map->count2 + 128*i + l3];
8228 if (i == 0) {
8229 return -1;
8230 }
8231 return i;
8232}
8233
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008234/* Lookup the character ch in the mapping. If the character
8235 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008236 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008237static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008238charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008239{
Christian Heimes217cfd12007-12-02 14:31:20 +00008240 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008241 PyObject *x;
8242
8243 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008244 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008245 x = PyObject_GetItem(mapping, w);
8246 Py_DECREF(w);
8247 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008248 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8249 /* No mapping found means: mapping is undefined. */
8250 PyErr_Clear();
8251 x = Py_None;
8252 Py_INCREF(x);
8253 return x;
8254 } else
8255 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008256 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008257 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008258 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008259 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008260 long value = PyLong_AS_LONG(x);
8261 if (value < 0 || value > 255) {
8262 PyErr_SetString(PyExc_TypeError,
8263 "character mapping must be in range(256)");
8264 Py_DECREF(x);
8265 return NULL;
8266 }
8267 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008268 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008269 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008270 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008271 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008272 /* wrong return value */
8273 PyErr_Format(PyExc_TypeError,
8274 "character mapping must return integer, bytes or None, not %.400s",
8275 x->ob_type->tp_name);
8276 Py_DECREF(x);
8277 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008278 }
8279}
8280
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008281static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008282charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008283{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008284 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8285 /* exponentially overallocate to minimize reallocations */
8286 if (requiredsize < 2*outsize)
8287 requiredsize = 2*outsize;
8288 if (_PyBytes_Resize(outobj, requiredsize))
8289 return -1;
8290 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008291}
8292
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS = 0,    /* the character was encoded and written */
    enc_FAILED = 1,     /* the mapping has no entry for the character */
    enc_EXCEPTION = 2   /* the lookup or the buffer resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008296/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008297 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008298 space is available. Return a new reference to the object that
8299 was put in the output buffer, or Py_None, if the mapping was undefined
8300 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008301 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008302static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008303charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008304 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008305{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008306 PyObject *rep;
8307 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008308 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008309
Christian Heimes90aa7642007-12-19 02:45:37 +00008310 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008311 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008312 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008313 if (res == -1)
8314 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008315 if (outsize<requiredsize)
8316 if (charmapencode_resize(outobj, outpos, requiredsize))
8317 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008318 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008319 outstart[(*outpos)++] = (char)res;
8320 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008321 }
8322
8323 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008324 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008325 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008326 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008327 Py_DECREF(rep);
8328 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008329 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008330 if (PyLong_Check(rep)) {
8331 Py_ssize_t requiredsize = *outpos+1;
8332 if (outsize<requiredsize)
8333 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8334 Py_DECREF(rep);
8335 return enc_EXCEPTION;
8336 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008337 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008338 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008339 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008340 else {
8341 const char *repchars = PyBytes_AS_STRING(rep);
8342 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8343 Py_ssize_t requiredsize = *outpos+repsize;
8344 if (outsize<requiredsize)
8345 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8346 Py_DECREF(rep);
8347 return enc_EXCEPTION;
8348 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008349 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008350 memcpy(outstart + *outpos, repchars, repsize);
8351 *outpos += repsize;
8352 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008353 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008354 Py_DECREF(rep);
8355 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008356}
8357
8358/* handle an error in PyUnicode_EncodeCharmap
8359 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008360static int
8361charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008362 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008363 PyObject **exceptionObject,
Victor Stinner50149202015-09-22 00:26:54 +02008364 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008365 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008366{
8367 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008368 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008369 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008370 enum PyUnicode_Kind kind;
8371 void *data;
8372 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008373 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008374 Py_ssize_t collstartpos = *inpos;
8375 Py_ssize_t collendpos = *inpos+1;
8376 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008377 char *encoding = "charmap";
8378 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008379 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008380 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008381 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008382
Benjamin Petersonbac79492012-01-14 13:34:47 -05008383 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008384 return -1;
8385 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008386 /* find all unencodable characters */
8387 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008388 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008389 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008390 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008391 val = encoding_map_lookup(ch, mapping);
8392 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008393 break;
8394 ++collendpos;
8395 continue;
8396 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008397
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008398 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8399 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008400 if (rep==NULL)
8401 return -1;
8402 else if (rep!=Py_None) {
8403 Py_DECREF(rep);
8404 break;
8405 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008406 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008407 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008408 }
8409 /* cache callback name lookup
8410 * (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02008411 if (*error_handler == _Py_ERROR_UNKNOWN)
8412 *error_handler = get_error_handler(errors);
8413
8414 switch (*error_handler) {
8415 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008416 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008417 return -1;
Victor Stinner50149202015-09-22 00:26:54 +02008418
8419 case _Py_ERROR_REPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008420 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008421 x = charmapencode_output('?', mapping, res, respos);
8422 if (x==enc_EXCEPTION) {
8423 return -1;
8424 }
8425 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008426 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008427 return -1;
8428 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008429 }
8430 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02008431 case _Py_ERROR_IGNORE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008432 *inpos = collendpos;
8433 break;
Victor Stinner50149202015-09-22 00:26:54 +02008434
8435 case _Py_ERROR_XMLCHARREFREPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008436 /* generate replacement (temporarily (mis)uses p) */
8437 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008438 char buffer[2+29+1+1];
8439 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008440 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008441 for (cp = buffer; *cp; ++cp) {
8442 x = charmapencode_output(*cp, mapping, res, respos);
8443 if (x==enc_EXCEPTION)
8444 return -1;
8445 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008446 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008447 return -1;
8448 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008449 }
8450 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008451 *inpos = collendpos;
8452 break;
Victor Stinner50149202015-09-22 00:26:54 +02008453
Benjamin Peterson14339b62009-01-31 16:36:08 +00008454 default:
Victor Stinner50149202015-09-22 00:26:54 +02008455 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008456 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008457 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008458 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008459 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008460 if (PyBytes_Check(repunicode)) {
8461 /* Directly copy bytes result to output. */
8462 Py_ssize_t outsize = PyBytes_Size(*res);
8463 Py_ssize_t requiredsize;
8464 repsize = PyBytes_Size(repunicode);
8465 requiredsize = *respos + repsize;
8466 if (requiredsize > outsize)
8467 /* Make room for all additional bytes. */
8468 if (charmapencode_resize(res, respos, requiredsize)) {
8469 Py_DECREF(repunicode);
8470 return -1;
8471 }
8472 memcpy(PyBytes_AsString(*res) + *respos,
8473 PyBytes_AsString(repunicode), repsize);
8474 *respos += repsize;
8475 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008476 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008477 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008478 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008479 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008480 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008481 Py_DECREF(repunicode);
8482 return -1;
8483 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008484 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008485 data = PyUnicode_DATA(repunicode);
8486 kind = PyUnicode_KIND(repunicode);
8487 for (index = 0; index < repsize; index++) {
8488 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8489 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008490 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008491 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008492 return -1;
8493 }
8494 else if (x==enc_FAILED) {
8495 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008496 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008497 return -1;
8498 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008499 }
8500 *inpos = newpos;
8501 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008502 }
8503 return 0;
8504}
8505
Alexander Belopolsky40018472011-02-26 01:02:56 +00008506PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008507_PyUnicode_EncodeCharmap(PyObject *unicode,
8508 PyObject *mapping,
8509 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008510{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008511 /* output object */
8512 PyObject *res = NULL;
8513 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008514 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008515 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008516 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008517 Py_ssize_t respos = 0;
Victor Stinner50149202015-09-22 00:26:54 +02008518 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008519 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02008520 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008521 void *data;
8522 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008523
Benjamin Petersonbac79492012-01-14 13:34:47 -05008524 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008525 return NULL;
8526 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008527 data = PyUnicode_DATA(unicode);
8528 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008529
Guido van Rossumd57fd912000-03-10 22:53:23 +00008530 /* Default to Latin-1 */
8531 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008532 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008533
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008534 /* allocate enough for a simple encoding without
8535 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008536 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008537 if (res == NULL)
8538 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008539 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008540 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008541
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008542 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008543 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008544 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008545 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008546 if (x==enc_EXCEPTION) /* error */
8547 goto onError;
8548 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008549 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008550 &exc,
Victor Stinner50149202015-09-22 00:26:54 +02008551 &error_handler, &error_handler_obj, errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00008552 &res, &respos)) {
8553 goto onError;
8554 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008555 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008556 else
8557 /* done with this character => adjust input position */
8558 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008559 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008560
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008561 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008562 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008563 if (_PyBytes_Resize(&res, respos) < 0)
8564 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008565
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008566 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008567 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008568 return res;
8569
Benjamin Peterson29060642009-01-31 22:14:21 +00008570 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008571 Py_XDECREF(res);
8572 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008573 Py_XDECREF(error_handler_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008574 return NULL;
8575}
8576
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008577/* Deprecated */
8578PyObject *
8579PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8580 Py_ssize_t size,
8581 PyObject *mapping,
8582 const char *errors)
8583{
8584 PyObject *result;
8585 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8586 if (unicode == NULL)
8587 return NULL;
8588 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8589 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008590 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008591}
8592
Alexander Belopolsky40018472011-02-26 01:02:56 +00008593PyObject *
8594PyUnicode_AsCharmapString(PyObject *unicode,
8595 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008596{
8597 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008598 PyErr_BadArgument();
8599 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008600 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008601 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008602}
8603
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008604/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008605static void
8606make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008607 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008608 Py_ssize_t startpos, Py_ssize_t endpos,
8609 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008610{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008611 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008612 *exceptionObject = _PyUnicodeTranslateError_Create(
8613 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008614 }
8615 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008616 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8617 goto onError;
8618 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8619 goto onError;
8620 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8621 goto onError;
8622 return;
8623 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008624 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008625 }
8626}
8627
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008628/* error handling callback helper:
8629 build arguments, call the callback and check the arguments,
8630 put the result into newpos and return the replacement string, which
8631 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008632static PyObject *
8633unicode_translate_call_errorhandler(const char *errors,
8634 PyObject **errorHandler,
8635 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008636 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008637 Py_ssize_t startpos, Py_ssize_t endpos,
8638 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008639{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02008640 static const char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008641
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008642 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008643 PyObject *restuple;
8644 PyObject *resunicode;
8645
8646 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008647 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008648 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008649 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008650 }
8651
8652 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008653 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008654 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008655 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008656
8657 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008658 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008659 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008660 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008661 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008662 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008663 Py_DECREF(restuple);
8664 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008665 }
8666 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008667 &resunicode, &i_newpos)) {
8668 Py_DECREF(restuple);
8669 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008670 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008671 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008672 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008673 else
8674 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008675 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02008676 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008677 Py_DECREF(restuple);
8678 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008679 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008680 Py_INCREF(resunicode);
8681 Py_DECREF(restuple);
8682 return resunicode;
8683}
8684
8685/* Lookup the character ch in the mapping and put the result in result,
8686 which must be decrefed by the caller.
8687 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008688static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008689charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008690{
Christian Heimes217cfd12007-12-02 14:31:20 +00008691 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008692 PyObject *x;
8693
8694 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008695 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008696 x = PyObject_GetItem(mapping, w);
8697 Py_DECREF(w);
8698 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008699 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8700 /* No mapping found means: use 1:1 mapping. */
8701 PyErr_Clear();
8702 *result = NULL;
8703 return 0;
8704 } else
8705 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008706 }
8707 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008708 *result = x;
8709 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008710 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008711 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008712 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008713 if (value < 0 || value > MAX_UNICODE) {
8714 PyErr_Format(PyExc_ValueError,
8715 "character mapping must be in range(0x%x)",
8716 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008717 Py_DECREF(x);
8718 return -1;
8719 }
8720 *result = x;
8721 return 0;
8722 }
8723 else if (PyUnicode_Check(x)) {
8724 *result = x;
8725 return 0;
8726 }
8727 else {
8728 /* wrong return value */
8729 PyErr_SetString(PyExc_TypeError,
8730 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008731 Py_DECREF(x);
8732 return -1;
8733 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008734}
Victor Stinner1194ea02014-04-04 19:37:40 +02008735
8736/* lookup the character, write the result into the writer.
8737 Return 1 if the result was written into the writer, return 0 if the mapping
8738 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008739static int
Victor Stinner1194ea02014-04-04 19:37:40 +02008740charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8741 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008742{
Victor Stinner1194ea02014-04-04 19:37:40 +02008743 PyObject *item;
8744
8745 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00008746 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008747
8748 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008749 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02008750 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008751 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008752 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008753 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008754 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008755
8756 if (item == Py_None) {
8757 Py_DECREF(item);
8758 return 0;
8759 }
8760
8761 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02008762 long ch = (Py_UCS4)PyLong_AS_LONG(item);
8763 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8764 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008765 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8766 Py_DECREF(item);
8767 return -1;
8768 }
8769 Py_DECREF(item);
8770 return 1;
8771 }
8772
8773 if (!PyUnicode_Check(item)) {
8774 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008775 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008776 }
8777
8778 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8779 Py_DECREF(item);
8780 return -1;
8781 }
8782
8783 Py_DECREF(item);
8784 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008785}
8786
Victor Stinner89a76ab2014-04-05 11:44:04 +02008787static int
8788unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8789 Py_UCS1 *translate)
8790{
Benjamin Peterson1365de72014-04-07 20:15:41 -04008791 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008792 int ret = 0;
8793
Victor Stinner89a76ab2014-04-05 11:44:04 +02008794 if (charmaptranslate_lookup(ch, mapping, &item)) {
8795 return -1;
8796 }
8797
8798 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008799 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02008800 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008801 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008802 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008803 /* not found => default to 1:1 mapping */
8804 translate[ch] = ch;
8805 return 1;
8806 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008807 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02008808 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008809 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8810 used it */
8811 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008812 /* invalid character or character outside ASCII:
8813 skip the fast translate */
8814 goto exit;
8815 }
8816 translate[ch] = (Py_UCS1)replace;
8817 }
8818 else if (PyUnicode_Check(item)) {
8819 Py_UCS4 replace;
8820
8821 if (PyUnicode_READY(item) == -1) {
8822 Py_DECREF(item);
8823 return -1;
8824 }
8825 if (PyUnicode_GET_LENGTH(item) != 1)
8826 goto exit;
8827
8828 replace = PyUnicode_READ_CHAR(item, 0);
8829 if (replace > 127)
8830 goto exit;
8831 translate[ch] = (Py_UCS1)replace;
8832 }
8833 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008834 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02008835 goto exit;
8836 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008837 ret = 1;
8838
Benjamin Peterson1365de72014-04-07 20:15:41 -04008839 exit:
8840 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008841 return ret;
8842}
8843
8844/* Fast path for ascii => ascii translation. Return 1 if the whole string
8845 was translated into writer, return 0 if the input string was partially
8846 translated into writer, raise an exception and return -1 on error. */
8847static int
8848unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008849 _PyUnicodeWriter *writer, int ignore,
8850 Py_ssize_t *input_pos)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008851{
Victor Stinner872b2912014-04-05 14:27:07 +02008852 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008853 Py_ssize_t len;
8854 Py_UCS1 *in, *end, *out;
Victor Stinner872b2912014-04-05 14:27:07 +02008855 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008856
Victor Stinner89a76ab2014-04-05 11:44:04 +02008857 len = PyUnicode_GET_LENGTH(input);
8858
Victor Stinner872b2912014-04-05 14:27:07 +02008859 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008860
8861 in = PyUnicode_1BYTE_DATA(input);
8862 end = in + len;
8863
8864 assert(PyUnicode_IS_ASCII(writer->buffer));
8865 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
8866 out = PyUnicode_1BYTE_DATA(writer->buffer);
8867
Victor Stinner872b2912014-04-05 14:27:07 +02008868 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008869 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02008870 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008871 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02008872 int translate = unicode_fast_translate_lookup(mapping, ch,
8873 ascii_table);
8874 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008875 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02008876 if (translate == 0)
8877 goto exit;
8878 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008879 }
Victor Stinner872b2912014-04-05 14:27:07 +02008880 if (ch2 == 0xfe) {
8881 if (ignore)
8882 continue;
8883 goto exit;
8884 }
8885 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008886 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02008887 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008888 }
Victor Stinner872b2912014-04-05 14:27:07 +02008889 res = 1;
8890
8891exit:
8892 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008893 *input_pos = in - PyUnicode_1BYTE_DATA(input);
Victor Stinner872b2912014-04-05 14:27:07 +02008894 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008895}
8896
Victor Stinner3222da22015-10-01 22:07:32 +02008897static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008898_PyUnicode_TranslateCharmap(PyObject *input,
8899 PyObject *mapping,
8900 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008901{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008902 /* input object */
Victor Stinner1194ea02014-04-04 19:37:40 +02008903 char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008904 Py_ssize_t size, i;
8905 int kind;
8906 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02008907 _PyUnicodeWriter writer;
8908 /* error handler */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008909 char *reason = "character maps to <undefined>";
8910 PyObject *errorHandler = NULL;
8911 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008912 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008913 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008914
Guido van Rossumd57fd912000-03-10 22:53:23 +00008915 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008916 PyErr_BadArgument();
8917 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008918 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008919
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008920 if (PyUnicode_READY(input) == -1)
8921 return NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008922 data = (char*)PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008923 kind = PyUnicode_KIND(input);
8924 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008925
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03008926 if (size == 0)
8927 return PyUnicode_FromObject(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008928
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008929 /* allocate enough for a simple 1:1 translation without
8930 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02008931 _PyUnicodeWriter_Init(&writer);
8932 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008933 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008934
Victor Stinner872b2912014-04-05 14:27:07 +02008935 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
8936
Victor Stinner33798672016-03-01 21:59:58 +01008937 if (PyUnicode_READY(input) == -1)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008938 return NULL;
Victor Stinner33798672016-03-01 21:59:58 +01008939 if (PyUnicode_IS_ASCII(input)) {
8940 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
8941 if (res < 0) {
8942 _PyUnicodeWriter_Dealloc(&writer);
8943 return NULL;
8944 }
8945 if (res == 1)
8946 return _PyUnicodeWriter_Finish(&writer);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008947 }
Victor Stinner33798672016-03-01 21:59:58 +01008948 else {
8949 i = 0;
8950 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008951
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008952 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008953 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008954 int translate;
8955 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8956 Py_ssize_t newpos;
8957 /* startpos for collecting untranslatable chars */
8958 Py_ssize_t collstart;
8959 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02008960 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008961
Victor Stinner1194ea02014-04-04 19:37:40 +02008962 ch = PyUnicode_READ(kind, data, i);
8963 translate = charmaptranslate_output(ch, mapping, &writer);
8964 if (translate < 0)
8965 goto onError;
8966
8967 if (translate != 0) {
8968 /* it worked => adjust input pointer */
8969 ++i;
8970 continue;
8971 }
8972
8973 /* untranslatable character */
8974 collstart = i;
8975 collend = i+1;
8976
8977 /* find all untranslatable characters */
8978 while (collend < size) {
8979 PyObject *x;
8980 ch = PyUnicode_READ(kind, data, collend);
8981 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008982 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02008983 Py_XDECREF(x);
8984 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008985 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02008986 ++collend;
8987 }
8988
8989 if (ignore) {
8990 i = collend;
8991 }
8992 else {
8993 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
8994 reason, input, &exc,
8995 collstart, collend, &newpos);
8996 if (repunicode == NULL)
8997 goto onError;
8998 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008999 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02009000 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009001 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009002 Py_DECREF(repunicode);
9003 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009004 }
9005 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009006 Py_XDECREF(exc);
9007 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02009008 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009009
Benjamin Peterson29060642009-01-31 22:14:21 +00009010 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02009011 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009012 Py_XDECREF(exc);
9013 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009014 return NULL;
9015}
9016
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009017/* Deprecated. Use PyUnicode_Translate instead. */
9018PyObject *
9019PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9020 Py_ssize_t size,
9021 PyObject *mapping,
9022 const char *errors)
9023{
Christian Heimes5f520f42012-09-11 14:03:25 +02009024 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009025 PyObject *unicode = PyUnicode_FromUnicode(p, size);
9026 if (!unicode)
9027 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02009028 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9029 Py_DECREF(unicode);
9030 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009031}
9032
Alexander Belopolsky40018472011-02-26 01:02:56 +00009033PyObject *
9034PyUnicode_Translate(PyObject *str,
9035 PyObject *mapping,
9036 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009037{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009038 if (ensure_unicode(str) < 0)
Christian Heimes5f520f42012-09-11 14:03:25 +02009039 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009040 return _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009041}
Tim Petersced69f82003-09-16 20:30:58 +00009042
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009043static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009044fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009045{
9046 /* No need to call PyUnicode_READY(self) because this function is only
9047 called as a callback from fixup() which does it already. */
9048 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9049 const int kind = PyUnicode_KIND(self);
9050 void *data = PyUnicode_DATA(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02009051 Py_UCS4 maxchar = 127, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009052 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009053 Py_ssize_t i;
9054
9055 for (i = 0; i < len; ++i) {
9056 ch = PyUnicode_READ(kind, data, i);
9057 fixed = 0;
9058 if (ch > 127) {
9059 if (Py_UNICODE_ISSPACE(ch))
9060 fixed = ' ';
9061 else {
9062 const int decimal = Py_UNICODE_TODECIMAL(ch);
9063 if (decimal >= 0)
9064 fixed = '0' + decimal;
9065 }
9066 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009067 modified = 1;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009068 maxchar = Py_MAX(maxchar, fixed);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009069 PyUnicode_WRITE(kind, data, i, fixed);
9070 }
Victor Stinnere6abb482012-05-02 01:15:40 +02009071 else
Benjamin Peterson7e303732013-06-10 09:19:46 -07009072 maxchar = Py_MAX(maxchar, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009073 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009074 }
9075
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009076 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009077}
9078
9079PyObject *
9080_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9081{
9082 if (!PyUnicode_Check(unicode)) {
9083 PyErr_BadInternalCall();
9084 return NULL;
9085 }
9086 if (PyUnicode_READY(unicode) == -1)
9087 return NULL;
9088 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
9089 /* If the string is already ASCII, just return the same string */
9090 Py_INCREF(unicode);
9091 return unicode;
9092 }
Victor Stinner9310abb2011-10-05 00:59:23 +02009093 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009094}
9095
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009096PyObject *
9097PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9098 Py_ssize_t length)
9099{
Victor Stinnerf0124502011-11-21 23:12:56 +01009100 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009101 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009102 Py_UCS4 maxchar;
9103 enum PyUnicode_Kind kind;
9104 void *data;
9105
Victor Stinner99d7ad02012-02-22 13:37:39 +01009106 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009107 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009108 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009109 if (ch > 127) {
9110 int decimal = Py_UNICODE_TODECIMAL(ch);
9111 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009112 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009113 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009114 }
9115 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009116
9117 /* Copy to a new string */
9118 decimal = PyUnicode_New(length, maxchar);
9119 if (decimal == NULL)
9120 return decimal;
9121 kind = PyUnicode_KIND(decimal);
9122 data = PyUnicode_DATA(decimal);
9123 /* Iterate over code points */
9124 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009125 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01009126 if (ch > 127) {
9127 int decimal = Py_UNICODE_TODECIMAL(ch);
9128 if (decimal >= 0)
9129 ch = '0' + decimal;
9130 }
9131 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009132 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009133 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009134}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009135/* --- Decimal Encoder ---------------------------------------------------- */
9136
Alexander Belopolsky40018472011-02-26 01:02:56 +00009137int
9138PyUnicode_EncodeDecimal(Py_UNICODE *s,
9139 Py_ssize_t length,
9140 char *output,
9141 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009142{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009143 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009144 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009145 enum PyUnicode_Kind kind;
9146 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009147
9148 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009149 PyErr_BadArgument();
9150 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009151 }
9152
Victor Stinner42bf7752011-11-21 22:52:58 +01009153 unicode = PyUnicode_FromUnicode(s, length);
9154 if (unicode == NULL)
9155 return -1;
9156
Benjamin Petersonbac79492012-01-14 13:34:47 -05009157 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner6345be92011-11-25 20:09:01 +01009158 Py_DECREF(unicode);
9159 return -1;
9160 }
Victor Stinner42bf7752011-11-21 22:52:58 +01009161 kind = PyUnicode_KIND(unicode);
9162 data = PyUnicode_DATA(unicode);
9163
Victor Stinnerb84d7232011-11-22 01:50:07 +01009164 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009165 PyObject *exc;
9166 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009167 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009168 Py_ssize_t startpos;
9169
9170 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009171
Benjamin Peterson29060642009-01-31 22:14:21 +00009172 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009173 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009174 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009175 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009176 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009177 decimal = Py_UNICODE_TODECIMAL(ch);
9178 if (decimal >= 0) {
9179 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009180 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009181 continue;
9182 }
9183 if (0 < ch && ch < 256) {
9184 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009185 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009186 continue;
9187 }
Victor Stinner6345be92011-11-25 20:09:01 +01009188
Victor Stinner42bf7752011-11-21 22:52:58 +01009189 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009190 exc = NULL;
9191 raise_encode_exception(&exc, "decimal", unicode,
9192 startpos, startpos+1,
9193 "invalid decimal Unicode string");
9194 Py_XDECREF(exc);
9195 Py_DECREF(unicode);
9196 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009197 }
9198 /* 0-terminate the output string */
9199 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009200 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009201 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009202}
9203
Guido van Rossumd57fd912000-03-10 22:53:23 +00009204/* --- Helpers ------------------------------------------------------------ */
9205
/* Helper macro to fix up start/end slice values against a sequence of
   length `len`:
     - `end` greater than `len` is clipped to `len`;
     - a negative `end` or `start` has `len` added to it and, if still
       negative, is clamped to 0.
   `start` and `end` must be modifiable lvalues; all three arguments may
   be evaluated more than once.  The body is wrapped in do { } while (0)
   so that the macro behaves as a single statement (it must be followed
   by a semicolon) and is safe inside an unbraced if/else. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
9220
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009221static Py_ssize_t
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009222any_find_slice(PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009223 Py_ssize_t start,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009224 Py_ssize_t end,
9225 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009226{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009227 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009228 void *buf1, *buf2;
9229 Py_ssize_t len1, len2, result;
9230
9231 kind1 = PyUnicode_KIND(s1);
9232 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009233 if (kind1 < kind2)
9234 return -1;
9235
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009236 len1 = PyUnicode_GET_LENGTH(s1);
9237 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009238 ADJUST_INDICES(start, end, len1);
9239 if (end - start < len2)
9240 return -1;
9241
9242 buf1 = PyUnicode_DATA(s1);
9243 buf2 = PyUnicode_DATA(s2);
9244 if (len2 == 1) {
9245 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9246 result = findchar((const char *)buf1 + kind1*start,
9247 kind1, end - start, ch, direction);
9248 if (result == -1)
9249 return -1;
9250 else
9251 return start + result;
9252 }
9253
9254 if (kind2 != kind1) {
9255 buf2 = _PyUnicode_AsKind(s2, kind1);
9256 if (!buf2)
9257 return -2;
9258 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009259
Victor Stinner794d5672011-10-10 03:21:36 +02009260 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009261 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009262 case PyUnicode_1BYTE_KIND:
9263 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9264 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9265 else
9266 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9267 break;
9268 case PyUnicode_2BYTE_KIND:
9269 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9270 break;
9271 case PyUnicode_4BYTE_KIND:
9272 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9273 break;
9274 default:
9275 assert(0); result = -2;
9276 }
9277 }
9278 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009279 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009280 case PyUnicode_1BYTE_KIND:
9281 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9282 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9283 else
9284 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9285 break;
9286 case PyUnicode_2BYTE_KIND:
9287 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9288 break;
9289 case PyUnicode_4BYTE_KIND:
9290 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9291 break;
9292 default:
9293 assert(0); result = -2;
9294 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009295 }
9296
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009297 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009298 PyMem_Free(buf2);
9299
9300 return result;
9301}
9302
9303Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009304_PyUnicode_InsertThousandsGrouping(
9305 PyObject *unicode, Py_ssize_t index,
9306 Py_ssize_t n_buffer,
9307 void *digits, Py_ssize_t n_digits,
9308 Py_ssize_t min_width,
9309 const char *grouping, PyObject *thousands_sep,
9310 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009311{
Victor Stinner41a863c2012-02-24 00:37:51 +01009312 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009313 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01009314 Py_ssize_t thousands_sep_len;
9315 Py_ssize_t len;
9316
9317 if (unicode != NULL) {
9318 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009319 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01009320 }
9321 else {
9322 kind = PyUnicode_1BYTE_KIND;
9323 data = NULL;
9324 }
9325 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
9326 thousands_sep_data = PyUnicode_DATA(thousands_sep);
9327 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9328 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01009329 if (thousands_sep_kind < kind) {
9330 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
9331 if (!thousands_sep_data)
9332 return -1;
9333 }
9334 else {
9335 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
9336 if (!data)
9337 return -1;
9338 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009339 }
9340
Benjamin Petersonead6b532011-12-20 17:23:42 -06009341 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009342 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009343 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01009344 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009345 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009346 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009347 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009348 else
Victor Stinner41a863c2012-02-24 00:37:51 +01009349 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02009350 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009351 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009352 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009353 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009354 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009355 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009356 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009357 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009358 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009359 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009360 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009361 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009362 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009363 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009364 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009365 break;
9366 default:
9367 assert(0);
9368 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009369 }
Victor Stinner90f50d42012-02-24 01:44:47 +01009370 if (unicode != NULL && thousands_sep_kind != kind) {
9371 if (thousands_sep_kind < kind)
9372 PyMem_Free(thousands_sep_data);
9373 else
9374 PyMem_Free(data);
9375 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009376 if (unicode == NULL) {
9377 *maxchar = 127;
9378 if (len != n_digits) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009379 *maxchar = Py_MAX(*maxchar,
Victor Stinnere6abb482012-05-02 01:15:40 +02009380 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
Victor Stinner41a863c2012-02-24 00:37:51 +01009381 }
9382 }
9383 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009384}
9385
9386
Alexander Belopolsky40018472011-02-26 01:02:56 +00009387Py_ssize_t
9388PyUnicode_Count(PyObject *str,
9389 PyObject *substr,
9390 Py_ssize_t start,
9391 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009392{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009393 Py_ssize_t result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009394 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009395 void *buf1 = NULL, *buf2 = NULL;
9396 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009397
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009398 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009399 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009400
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009401 kind1 = PyUnicode_KIND(str);
9402 kind2 = PyUnicode_KIND(substr);
9403 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009404 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009405
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009406 len1 = PyUnicode_GET_LENGTH(str);
9407 len2 = PyUnicode_GET_LENGTH(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009408 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009409 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009410 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009411
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009412 buf1 = PyUnicode_DATA(str);
9413 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009414 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009415 buf2 = _PyUnicode_AsKind(substr, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009416 if (!buf2)
9417 goto onError;
9418 }
9419
9420 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009421 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009422 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
Victor Stinnerc3cec782011-10-05 21:24:08 +02009423 result = asciilib_count(
9424 ((Py_UCS1*)buf1) + start, end - start,
9425 buf2, len2, PY_SSIZE_T_MAX
9426 );
9427 else
9428 result = ucs1lib_count(
9429 ((Py_UCS1*)buf1) + start, end - start,
9430 buf2, len2, PY_SSIZE_T_MAX
9431 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009432 break;
9433 case PyUnicode_2BYTE_KIND:
9434 result = ucs2lib_count(
9435 ((Py_UCS2*)buf1) + start, end - start,
9436 buf2, len2, PY_SSIZE_T_MAX
9437 );
9438 break;
9439 case PyUnicode_4BYTE_KIND:
9440 result = ucs4lib_count(
9441 ((Py_UCS4*)buf1) + start, end - start,
9442 buf2, len2, PY_SSIZE_T_MAX
9443 );
9444 break;
9445 default:
9446 assert(0); result = 0;
9447 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009448
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009449 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009450 PyMem_Free(buf2);
9451
Guido van Rossumd57fd912000-03-10 22:53:23 +00009452 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009453 onError:
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009454 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009455 PyMem_Free(buf2);
9456 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009457}
9458
Alexander Belopolsky40018472011-02-26 01:02:56 +00009459Py_ssize_t
9460PyUnicode_Find(PyObject *str,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009461 PyObject *substr,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009462 Py_ssize_t start,
9463 Py_ssize_t end,
9464 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009465{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009466 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009467 return -2;
Tim Petersced69f82003-09-16 20:30:58 +00009468
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009469 return any_find_slice(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009470}
9471
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009472Py_ssize_t
9473PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9474 Py_ssize_t start, Py_ssize_t end,
9475 int direction)
9476{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009477 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009478 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009479 if (PyUnicode_READY(str) == -1)
9480 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009481 if (start < 0 || end < 0) {
9482 PyErr_SetString(PyExc_IndexError, "string index out of range");
9483 return -2;
9484 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009485 if (end > PyUnicode_GET_LENGTH(str))
9486 end = PyUnicode_GET_LENGTH(str);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009487 if (start >= end)
9488 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009489 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009490 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9491 kind, end-start, ch, direction);
9492 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009493 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009494 else
9495 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009496}
9497
Alexander Belopolsky40018472011-02-26 01:02:56 +00009498static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009499tailmatch(PyObject *self,
9500 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009501 Py_ssize_t start,
9502 Py_ssize_t end,
9503 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009504{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009505 int kind_self;
9506 int kind_sub;
9507 void *data_self;
9508 void *data_sub;
9509 Py_ssize_t offset;
9510 Py_ssize_t i;
9511 Py_ssize_t end_sub;
9512
9513 if (PyUnicode_READY(self) == -1 ||
9514 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009515 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009516
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009517 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9518 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009519 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009520 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009521
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009522 if (PyUnicode_GET_LENGTH(substring) == 0)
9523 return 1;
9524
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009525 kind_self = PyUnicode_KIND(self);
9526 data_self = PyUnicode_DATA(self);
9527 kind_sub = PyUnicode_KIND(substring);
9528 data_sub = PyUnicode_DATA(substring);
9529 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9530
9531 if (direction > 0)
9532 offset = end;
9533 else
9534 offset = start;
9535
9536 if (PyUnicode_READ(kind_self, data_self, offset) ==
9537 PyUnicode_READ(kind_sub, data_sub, 0) &&
9538 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9539 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9540 /* If both are of the same kind, memcmp is sufficient */
9541 if (kind_self == kind_sub) {
9542 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009543 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009544 data_sub,
9545 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009546 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009547 }
Martin Pantere26da7c2016-06-02 10:07:09 +00009548 /* otherwise we have to compare each character by first accessing it */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009549 else {
9550 /* We do not need to compare 0 and len(substring)-1 because
9551 the if statement above ensured already that they are equal
9552 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009553 for (i = 1; i < end_sub; ++i) {
9554 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9555 PyUnicode_READ(kind_sub, data_sub, i))
9556 return 0;
9557 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009558 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009559 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009560 }
9561
9562 return 0;
9563}
9564
Alexander Belopolsky40018472011-02-26 01:02:56 +00009565Py_ssize_t
9566PyUnicode_Tailmatch(PyObject *str,
9567 PyObject *substr,
9568 Py_ssize_t start,
9569 Py_ssize_t end,
9570 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009571{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009572 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009573 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009574
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009575 return tailmatch(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009576}
9577
Guido van Rossumd57fd912000-03-10 22:53:23 +00009578/* Apply fixfct filter to the Unicode object self and return a
9579 reference to the modified object */
9580
Alexander Belopolsky40018472011-02-26 01:02:56 +00009581static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009582fixup(PyObject *self,
9583 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009584{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009585 PyObject *u;
9586 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009587 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009588
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009589 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009590 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009591 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009592 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009593
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009594 /* fix functions return the new maximum character in a string,
9595 if the kind of the resulting unicode object does not change,
9596 everything is fine. Otherwise we need to change the string kind
9597 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009598 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009599
9600 if (maxchar_new == 0) {
9601 /* no changes */;
9602 if (PyUnicode_CheckExact(self)) {
9603 Py_DECREF(u);
9604 Py_INCREF(self);
9605 return self;
9606 }
9607 else
9608 return u;
9609 }
9610
Victor Stinnere6abb482012-05-02 01:15:40 +02009611 maxchar_new = align_maxchar(maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009612
Victor Stinnereaab6042011-12-11 22:22:39 +01009613 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009614 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009615
9616 /* In case the maximum character changed, we need to
9617 convert the string to the new category. */
9618 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9619 if (v == NULL) {
9620 Py_DECREF(u);
9621 return NULL;
9622 }
9623 if (maxchar_new > maxchar_old) {
9624 /* If the maxchar increased so that the kind changed, not all
9625 characters are representable anymore and we need to fix the
9626 string again. This only happens in very few cases. */
Victor Stinnerd3f08822012-05-29 12:57:52 +02009627 _PyUnicode_FastCopyCharacters(v, 0,
9628 self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinnereaab6042011-12-11 22:22:39 +01009629 maxchar_old = fixfct(v);
9630 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009631 }
9632 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009633 _PyUnicode_FastCopyCharacters(v, 0,
9634 u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009635 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009636 Py_DECREF(u);
9637 assert(_PyUnicode_CheckConsistency(v, 1));
9638 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009639}
9640
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009641static PyObject *
9642ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009643{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009644 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9645 char *resdata, *data = PyUnicode_DATA(self);
9646 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009647
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009648 res = PyUnicode_New(len, 127);
9649 if (res == NULL)
9650 return NULL;
9651 resdata = PyUnicode_DATA(res);
9652 if (lower)
9653 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009654 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009655 _Py_bytes_upper(resdata, data, len);
9656 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009657}
9658
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009659static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009660handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009661{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009662 Py_ssize_t j;
9663 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01009664 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009665 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009666
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009667 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9668
9669 where ! is a negation and \p{xxx} is a character with property xxx.
9670 */
9671 for (j = i - 1; j >= 0; j--) {
9672 c = PyUnicode_READ(kind, data, j);
9673 if (!_PyUnicode_IsCaseIgnorable(c))
9674 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009675 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009676 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9677 if (final_sigma) {
9678 for (j = i + 1; j < length; j++) {
9679 c = PyUnicode_READ(kind, data, j);
9680 if (!_PyUnicode_IsCaseIgnorable(c))
9681 break;
9682 }
9683 final_sigma = j == length || !_PyUnicode_IsCased(c);
9684 }
9685 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009686}
9687
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009688static int
9689lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9690 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009691{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009692 /* Obscure special case. */
9693 if (c == 0x3A3) {
9694 mapped[0] = handle_capital_sigma(kind, data, length, i);
9695 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009696 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009697 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009698}
9699
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009700static Py_ssize_t
9701do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009702{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009703 Py_ssize_t i, k = 0;
9704 int n_res, j;
9705 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009706
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009707 c = PyUnicode_READ(kind, data, 0);
9708 n_res = _PyUnicode_ToUpperFull(c, mapped);
9709 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009710 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009711 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009712 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009713 for (i = 1; i < length; i++) {
9714 c = PyUnicode_READ(kind, data, i);
9715 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9716 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009717 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009718 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009719 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009720 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009721 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009722}
9723
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009724static Py_ssize_t
9725do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9726 Py_ssize_t i, k = 0;
9727
9728 for (i = 0; i < length; i++) {
9729 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9730 int n_res, j;
9731 if (Py_UNICODE_ISUPPER(c)) {
9732 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9733 }
9734 else if (Py_UNICODE_ISLOWER(c)) {
9735 n_res = _PyUnicode_ToUpperFull(c, mapped);
9736 }
9737 else {
9738 n_res = 1;
9739 mapped[0] = c;
9740 }
9741 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009742 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009743 res[k++] = mapped[j];
9744 }
9745 }
9746 return k;
9747}
9748
9749static Py_ssize_t
9750do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9751 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009752{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009753 Py_ssize_t i, k = 0;
9754
9755 for (i = 0; i < length; i++) {
9756 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9757 int n_res, j;
9758 if (lower)
9759 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9760 else
9761 n_res = _PyUnicode_ToUpperFull(c, mapped);
9762 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009763 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009764 res[k++] = mapped[j];
9765 }
9766 }
9767 return k;
9768}
9769
9770static Py_ssize_t
9771do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9772{
9773 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9774}
9775
9776static Py_ssize_t
9777do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9778{
9779 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9780}
9781
Benjamin Petersone51757f2012-01-12 21:10:29 -05009782static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009783do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9784{
9785 Py_ssize_t i, k = 0;
9786
9787 for (i = 0; i < length; i++) {
9788 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9789 Py_UCS4 mapped[3];
9790 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9791 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009792 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009793 res[k++] = mapped[j];
9794 }
9795 }
9796 return k;
9797}
9798
9799static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009800do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9801{
9802 Py_ssize_t i, k = 0;
9803 int previous_is_cased;
9804
9805 previous_is_cased = 0;
9806 for (i = 0; i < length; i++) {
9807 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9808 Py_UCS4 mapped[3];
9809 int n_res, j;
9810
9811 if (previous_is_cased)
9812 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9813 else
9814 n_res = _PyUnicode_ToTitleFull(c, mapped);
9815
9816 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009817 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009818 res[k++] = mapped[j];
9819 }
9820
9821 previous_is_cased = _PyUnicode_IsCased(c);
9822 }
9823 return k;
9824}
9825
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009826static PyObject *
9827case_operation(PyObject *self,
9828 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9829{
9830 PyObject *res = NULL;
9831 Py_ssize_t length, newlength = 0;
9832 int kind, outkind;
9833 void *data, *outdata;
9834 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9835
Benjamin Petersoneea48462012-01-16 14:28:50 -05009836 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009837
9838 kind = PyUnicode_KIND(self);
9839 data = PyUnicode_DATA(self);
9840 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +02009841 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -04009842 PyErr_SetString(PyExc_OverflowError, "string is too long");
9843 return NULL;
9844 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -04009845 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009846 if (tmp == NULL)
9847 return PyErr_NoMemory();
9848 newlength = perform(kind, data, length, tmp, &maxchar);
9849 res = PyUnicode_New(newlength, maxchar);
9850 if (res == NULL)
9851 goto leave;
9852 tmpend = tmp + newlength;
9853 outdata = PyUnicode_DATA(res);
9854 outkind = PyUnicode_KIND(res);
9855 switch (outkind) {
9856 case PyUnicode_1BYTE_KIND:
9857 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9858 break;
9859 case PyUnicode_2BYTE_KIND:
9860 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9861 break;
9862 case PyUnicode_4BYTE_KIND:
9863 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9864 break;
9865 default:
9866 assert(0);
9867 break;
9868 }
9869 leave:
9870 PyMem_FREE(tmp);
9871 return res;
9872}
9873
Tim Peters8ce9f162004-08-27 01:49:32 +00009874PyObject *
9875PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009876{
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009877 PyObject *res;
9878 PyObject *fseq;
9879 Py_ssize_t seqlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009880 PyObject **items;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009881
Benjamin Peterson9743b2c2014-02-15 13:02:52 -05009882 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +00009883 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009884 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009885 }
9886
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009887 /* NOTE: the following code can't call back into Python code,
9888 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009889 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009890
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009891 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00009892 seqlen = PySequence_Fast_GET_SIZE(fseq);
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009893 res = _PyUnicode_JoinArray(separator, items, seqlen);
9894 Py_DECREF(fseq);
9895 return res;
9896}
9897
9898PyObject *
9899_PyUnicode_JoinArray(PyObject *separator, PyObject **items, Py_ssize_t seqlen)
9900{
9901 PyObject *res = NULL; /* the result */
9902 PyObject *sep = NULL;
9903 Py_ssize_t seplen;
9904 PyObject *item;
9905 Py_ssize_t sz, i, res_offset;
9906 Py_UCS4 maxchar;
9907 Py_UCS4 item_maxchar;
9908 int use_memcpy;
9909 unsigned char *res_data = NULL, *sep_data = NULL;
9910 PyObject *last_obj;
9911 unsigned int kind = 0;
9912
Tim Peters05eba1f2004-08-27 21:32:02 +00009913 /* If empty sequence, return u"". */
9914 if (seqlen == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02009915 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +00009916 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009917
Tim Peters05eba1f2004-08-27 21:32:02 +00009918 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009919 last_obj = NULL;
Victor Stinneracf47b82011-10-06 12:32:37 +02009920 if (seqlen == 1) {
9921 if (PyUnicode_CheckExact(items[0])) {
9922 res = items[0];
9923 Py_INCREF(res);
Victor Stinneracf47b82011-10-06 12:32:37 +02009924 return res;
9925 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009926 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009927 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009928 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009929 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009930 /* Set up sep and seplen */
9931 if (separator == NULL) {
9932 /* fall back to a blank space separator */
9933 sep = PyUnicode_FromOrdinal(' ');
9934 if (!sep)
9935 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009936 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009937 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009938 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009939 else {
9940 if (!PyUnicode_Check(separator)) {
9941 PyErr_Format(PyExc_TypeError,
9942 "separator: expected str instance,"
9943 " %.80s found",
9944 Py_TYPE(separator)->tp_name);
9945 goto onError;
9946 }
9947 if (PyUnicode_READY(separator))
9948 goto onError;
9949 sep = separator;
9950 seplen = PyUnicode_GET_LENGTH(separator);
9951 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9952 /* inc refcount to keep this code path symmetric with the
9953 above case of a blank separator */
9954 Py_INCREF(sep);
9955 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009956 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009957 }
9958
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009959 /* There are at least two things to join, or else we have a subclass
9960 * of str in the sequence.
9961 * Do a pre-pass to figure out the total amount of space we'll
9962 * need (sz), and see whether all argument are strings.
9963 */
9964 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009965#ifdef Py_DEBUG
9966 use_memcpy = 0;
9967#else
9968 use_memcpy = 1;
9969#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009970 for (i = 0; i < seqlen; i++) {
9971 const Py_ssize_t old_sz = sz;
9972 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009973 if (!PyUnicode_Check(item)) {
9974 PyErr_Format(PyExc_TypeError,
Victor Stinnera33bce02014-07-04 22:47:46 +02009975 "sequence item %zd: expected str instance,"
Benjamin Peterson29060642009-01-31 22:14:21 +00009976 " %.80s found",
9977 i, Py_TYPE(item)->tp_name);
9978 goto onError;
9979 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009980 if (PyUnicode_READY(item) == -1)
9981 goto onError;
9982 sz += PyUnicode_GET_LENGTH(item);
9983 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -07009984 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009985 if (i != 0)
9986 sz += seplen;
9987 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9988 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009989 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009990 goto onError;
9991 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009992 if (use_memcpy && last_obj != NULL) {
9993 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9994 use_memcpy = 0;
9995 }
9996 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009997 }
Tim Petersced69f82003-09-16 20:30:58 +00009998
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009999 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010000 if (res == NULL)
10001 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +000010002
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010003 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010004#ifdef Py_DEBUG
10005 use_memcpy = 0;
10006#else
10007 if (use_memcpy) {
10008 res_data = PyUnicode_1BYTE_DATA(res);
10009 kind = PyUnicode_KIND(res);
10010 if (seplen != 0)
10011 sep_data = PyUnicode_1BYTE_DATA(sep);
10012 }
10013#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +020010014 if (use_memcpy) {
10015 for (i = 0; i < seqlen; ++i) {
10016 Py_ssize_t itemlen;
10017 item = items[i];
10018
10019 /* Copy item, and maybe the separator. */
10020 if (i && seplen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010021 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010022 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010023 kind * seplen);
10024 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010025 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010026
10027 itemlen = PyUnicode_GET_LENGTH(item);
10028 if (itemlen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010029 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010030 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010031 kind * itemlen);
10032 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010033 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010034 }
10035 assert(res_data == PyUnicode_1BYTE_DATA(res)
10036 + kind * PyUnicode_GET_LENGTH(res));
10037 }
10038 else {
10039 for (i = 0, res_offset = 0; i < seqlen; ++i) {
10040 Py_ssize_t itemlen;
10041 item = items[i];
10042
10043 /* Copy item, and maybe the separator. */
10044 if (i && seplen != 0) {
10045 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10046 res_offset += seplen;
10047 }
10048
10049 itemlen = PyUnicode_GET_LENGTH(item);
10050 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020010051 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +020010052 res_offset += itemlen;
10053 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010054 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010055 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +020010056 }
Tim Peters8ce9f162004-08-27 01:49:32 +000010057
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010058 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010059 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010060 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010061
Benjamin Peterson29060642009-01-31 22:14:21 +000010062 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010063 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010064 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010065 return NULL;
10066}
10067
/* Store `value` into `length` consecutive code units of the buffer `data`,
   starting at code-unit index `start`; `kind` selects the unit width.
   `value`, `start` and `length` are each evaluated exactly once.  In the
   1-byte and 2-byte cases `value` is narrowed to the unit type, so the
   caller must make sure it fits. */
#define FILL(kind, data, value, start, length) \
    do { \
        const Py_UCS4 fill_value_ = (value); \
        const Py_ssize_t fill_start_ = (start); \
        const Py_ssize_t fill_length_ = (length); \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + fill_start_; \
            memset(to_, (unsigned char)fill_value_, fill_length_); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + fill_start_; \
            for (; i_ < fill_length_; ++i_, ++to_) *to_ = fill_value_; \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + fill_start_; \
            for (; i_ < fill_length_; ++i_, ++to_) *to_ = fill_value_; \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
10091
Victor Stinnerd3f08822012-05-29 12:57:52 +020010092void
10093_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10094 Py_UCS4 fill_char)
10095{
10096 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
10097 const void *data = PyUnicode_DATA(unicode);
10098 assert(PyUnicode_IS_READY(unicode));
10099 assert(unicode_modifiable(unicode));
10100 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10101 assert(start >= 0);
10102 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
10103 FILL(kind, data, fill_char, start, length);
10104}
10105
Victor Stinner3fe55312012-01-04 00:33:50 +010010106Py_ssize_t
10107PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10108 Py_UCS4 fill_char)
10109{
10110 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +010010111
10112 if (!PyUnicode_Check(unicode)) {
10113 PyErr_BadInternalCall();
10114 return -1;
10115 }
10116 if (PyUnicode_READY(unicode) == -1)
10117 return -1;
10118 if (unicode_check_modifiable(unicode))
10119 return -1;
10120
Victor Stinnerd3f08822012-05-29 12:57:52 +020010121 if (start < 0) {
10122 PyErr_SetString(PyExc_IndexError, "string index out of range");
10123 return -1;
10124 }
Victor Stinner3fe55312012-01-04 00:33:50 +010010125 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10126 PyErr_SetString(PyExc_ValueError,
10127 "fill character is bigger than "
10128 "the string maximum character");
10129 return -1;
10130 }
10131
10132 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10133 length = Py_MIN(maxlen, length);
10134 if (length <= 0)
10135 return 0;
10136
Victor Stinnerd3f08822012-05-29 12:57:52 +020010137 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +010010138 return length;
10139}
10140
Victor Stinner9310abb2011-10-05 00:59:23 +020010141static PyObject *
10142pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010143 Py_ssize_t left,
10144 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010145 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010146{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010147 PyObject *u;
10148 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010149 int kind;
10150 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010151
10152 if (left < 0)
10153 left = 0;
10154 if (right < 0)
10155 right = 0;
10156
Victor Stinnerc4b49542011-12-11 22:44:26 +010010157 if (left == 0 && right == 0)
10158 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010159
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010160 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10161 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010162 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10163 return NULL;
10164 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010165 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010166 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010167 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010168 if (!u)
10169 return NULL;
10170
10171 kind = PyUnicode_KIND(u);
10172 data = PyUnicode_DATA(u);
10173 if (left)
10174 FILL(kind, data, fill, 0, left);
10175 if (right)
10176 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010177 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010178 assert(_PyUnicode_CheckConsistency(u, 1));
10179 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010180}
10181
Alexander Belopolsky40018472011-02-26 01:02:56 +000010182PyObject *
10183PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010184{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010185 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010186
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010187 if (ensure_unicode(string) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010188 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010189
Benjamin Petersonead6b532011-12-20 17:23:42 -060010190 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010191 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010192 if (PyUnicode_IS_ASCII(string))
10193 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010194 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010195 PyUnicode_GET_LENGTH(string), keepends);
10196 else
10197 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010198 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010199 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010200 break;
10201 case PyUnicode_2BYTE_KIND:
10202 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010203 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010204 PyUnicode_GET_LENGTH(string), keepends);
10205 break;
10206 case PyUnicode_4BYTE_KIND:
10207 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010208 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010209 PyUnicode_GET_LENGTH(string), keepends);
10210 break;
10211 default:
10212 assert(0);
10213 list = 0;
10214 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010215 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010216}
10217
Alexander Belopolsky40018472011-02-26 01:02:56 +000010218static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010219split(PyObject *self,
10220 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010221 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010222{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010223 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010224 void *buf1, *buf2;
10225 Py_ssize_t len1, len2;
10226 PyObject* out;
10227
Guido van Rossumd57fd912000-03-10 22:53:23 +000010228 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010229 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010230
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010231 if (PyUnicode_READY(self) == -1)
10232 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010233
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010234 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010235 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010236 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010237 if (PyUnicode_IS_ASCII(self))
10238 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010239 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010240 PyUnicode_GET_LENGTH(self), maxcount
10241 );
10242 else
10243 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010244 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010245 PyUnicode_GET_LENGTH(self), maxcount
10246 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010247 case PyUnicode_2BYTE_KIND:
10248 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010249 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010250 PyUnicode_GET_LENGTH(self), maxcount
10251 );
10252 case PyUnicode_4BYTE_KIND:
10253 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010254 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010255 PyUnicode_GET_LENGTH(self), maxcount
10256 );
10257 default:
10258 assert(0);
10259 return NULL;
10260 }
10261
10262 if (PyUnicode_READY(substring) == -1)
10263 return NULL;
10264
10265 kind1 = PyUnicode_KIND(self);
10266 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010267 len1 = PyUnicode_GET_LENGTH(self);
10268 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010269 if (kind1 < kind2 || len1 < len2) {
10270 out = PyList_New(1);
10271 if (out == NULL)
10272 return NULL;
10273 Py_INCREF(self);
10274 PyList_SET_ITEM(out, 0, self);
10275 return out;
10276 }
10277 buf1 = PyUnicode_DATA(self);
10278 buf2 = PyUnicode_DATA(substring);
10279 if (kind2 != kind1) {
10280 buf2 = _PyUnicode_AsKind(substring, kind1);
10281 if (!buf2)
10282 return NULL;
10283 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010284
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010285 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010286 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010287 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10288 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010289 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010290 else
10291 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010292 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010293 break;
10294 case PyUnicode_2BYTE_KIND:
10295 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010296 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010297 break;
10298 case PyUnicode_4BYTE_KIND:
10299 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010300 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010301 break;
10302 default:
10303 out = NULL;
10304 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010305 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010306 PyMem_Free(buf2);
10307 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010308}
10309
Alexander Belopolsky40018472011-02-26 01:02:56 +000010310static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010311rsplit(PyObject *self,
10312 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010313 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010314{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010315 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010316 void *buf1, *buf2;
10317 Py_ssize_t len1, len2;
10318 PyObject* out;
10319
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010320 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010321 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010322
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010323 if (PyUnicode_READY(self) == -1)
10324 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010325
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010326 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010327 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010328 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010329 if (PyUnicode_IS_ASCII(self))
10330 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010331 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010332 PyUnicode_GET_LENGTH(self), maxcount
10333 );
10334 else
10335 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010336 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010337 PyUnicode_GET_LENGTH(self), maxcount
10338 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010339 case PyUnicode_2BYTE_KIND:
10340 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010341 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010342 PyUnicode_GET_LENGTH(self), maxcount
10343 );
10344 case PyUnicode_4BYTE_KIND:
10345 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010346 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010347 PyUnicode_GET_LENGTH(self), maxcount
10348 );
10349 default:
10350 assert(0);
10351 return NULL;
10352 }
10353
10354 if (PyUnicode_READY(substring) == -1)
10355 return NULL;
10356
10357 kind1 = PyUnicode_KIND(self);
10358 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010359 len1 = PyUnicode_GET_LENGTH(self);
10360 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010361 if (kind1 < kind2 || len1 < len2) {
10362 out = PyList_New(1);
10363 if (out == NULL)
10364 return NULL;
10365 Py_INCREF(self);
10366 PyList_SET_ITEM(out, 0, self);
10367 return out;
10368 }
10369 buf1 = PyUnicode_DATA(self);
10370 buf2 = PyUnicode_DATA(substring);
10371 if (kind2 != kind1) {
10372 buf2 = _PyUnicode_AsKind(substring, kind1);
10373 if (!buf2)
10374 return NULL;
10375 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010376
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010377 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010378 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010379 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10380 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010381 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010382 else
10383 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010384 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010385 break;
10386 case PyUnicode_2BYTE_KIND:
10387 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010388 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010389 break;
10390 case PyUnicode_4BYTE_KIND:
10391 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010392 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010393 break;
10394 default:
10395 out = NULL;
10396 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010397 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010398 PyMem_Free(buf2);
10399 return out;
10400}
10401
10402static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010403anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10404 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010405{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010406 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010407 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010408 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10409 return asciilib_find(buf1, len1, buf2, len2, offset);
10410 else
10411 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010412 case PyUnicode_2BYTE_KIND:
10413 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10414 case PyUnicode_4BYTE_KIND:
10415 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10416 }
10417 assert(0);
10418 return -1;
10419}
10420
10421static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010422anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10423 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010424{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010425 switch (kind) {
10426 case PyUnicode_1BYTE_KIND:
10427 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10428 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10429 else
10430 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10431 case PyUnicode_2BYTE_KIND:
10432 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10433 case PyUnicode_4BYTE_KIND:
10434 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10435 }
10436 assert(0);
10437 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010438}
10439
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010440static void
10441replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10442 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10443{
10444 int kind = PyUnicode_KIND(u);
10445 void *data = PyUnicode_DATA(u);
10446 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10447 if (kind == PyUnicode_1BYTE_KIND) {
10448 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10449 (Py_UCS1 *)data + len,
10450 u1, u2, maxcount);
10451 }
10452 else if (kind == PyUnicode_2BYTE_KIND) {
10453 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10454 (Py_UCS2 *)data + len,
10455 u1, u2, maxcount);
10456 }
10457 else {
10458 assert(kind == PyUnicode_4BYTE_KIND);
10459 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10460 (Py_UCS4 *)data + len,
10461 u1, u2, maxcount);
10462 }
10463}
10464
Alexander Belopolsky40018472011-02-26 01:02:56 +000010465static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010466replace(PyObject *self, PyObject *str1,
10467 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010468{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010469 PyObject *u;
10470 char *sbuf = PyUnicode_DATA(self);
10471 char *buf1 = PyUnicode_DATA(str1);
10472 char *buf2 = PyUnicode_DATA(str2);
10473 int srelease = 0, release1 = 0, release2 = 0;
10474 int skind = PyUnicode_KIND(self);
10475 int kind1 = PyUnicode_KIND(str1);
10476 int kind2 = PyUnicode_KIND(str2);
10477 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10478 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10479 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010480 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010481 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010482
10483 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010484 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010485 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010486 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010487
Victor Stinner59de0ee2011-10-07 10:01:28 +020010488 if (str1 == str2)
10489 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010490
Victor Stinner49a0a212011-10-12 23:46:10 +020010491 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010492 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10493 if (maxchar < maxchar_str1)
10494 /* substring too wide to be present */
10495 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010496 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10497 /* Replacing str1 with str2 may cause a maxchar reduction in the
10498 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010499 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010500 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010501
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010502 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010503 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010504 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010505 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010506 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010507 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010508 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010509 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010510
Victor Stinner69ed0f42013-04-09 21:48:24 +020010511 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010512 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010513 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010514 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010515 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010516 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010517 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010518 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010519
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010520 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10521 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010522 }
10523 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010524 int rkind = skind;
10525 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010526 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010527
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010528 if (kind1 < rkind) {
10529 /* widen substring */
10530 buf1 = _PyUnicode_AsKind(str1, rkind);
10531 if (!buf1) goto error;
10532 release1 = 1;
10533 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010534 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010535 if (i < 0)
10536 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010537 if (rkind > kind2) {
10538 /* widen replacement */
10539 buf2 = _PyUnicode_AsKind(str2, rkind);
10540 if (!buf2) goto error;
10541 release2 = 1;
10542 }
10543 else if (rkind < kind2) {
10544 /* widen self and buf1 */
10545 rkind = kind2;
10546 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010547 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010548 sbuf = _PyUnicode_AsKind(self, rkind);
10549 if (!sbuf) goto error;
10550 srelease = 1;
10551 buf1 = _PyUnicode_AsKind(str1, rkind);
10552 if (!buf1) goto error;
10553 release1 = 1;
10554 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010555 u = PyUnicode_New(slen, maxchar);
10556 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010557 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010558 assert(PyUnicode_KIND(u) == rkind);
10559 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010560
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010561 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010562 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010563 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010564 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010565 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010566 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010567
10568 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010569 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010570 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010571 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010572 if (i == -1)
10573 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010574 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010575 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010576 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010577 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010578 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010579 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010580 }
10581 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010582 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010583 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010584 int rkind = skind;
10585 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010586
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010587 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010588 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010589 buf1 = _PyUnicode_AsKind(str1, rkind);
10590 if (!buf1) goto error;
10591 release1 = 1;
10592 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010593 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010594 if (n == 0)
10595 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010596 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010597 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010598 buf2 = _PyUnicode_AsKind(str2, rkind);
10599 if (!buf2) goto error;
10600 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010601 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010602 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010603 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010604 rkind = kind2;
10605 sbuf = _PyUnicode_AsKind(self, rkind);
10606 if (!sbuf) goto error;
10607 srelease = 1;
10608 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010609 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010610 buf1 = _PyUnicode_AsKind(str1, rkind);
10611 if (!buf1) goto error;
10612 release1 = 1;
10613 }
10614 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10615 PyUnicode_GET_LENGTH(str1))); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010616 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010617 PyErr_SetString(PyExc_OverflowError,
10618 "replace string is too long");
10619 goto error;
10620 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010621 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010622 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010623 _Py_INCREF_UNICODE_EMPTY();
10624 if (!unicode_empty)
10625 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010626 u = unicode_empty;
10627 goto done;
10628 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010629 if (new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010630 PyErr_SetString(PyExc_OverflowError,
10631 "replace string is too long");
10632 goto error;
10633 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010634 u = PyUnicode_New(new_size, maxchar);
10635 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010636 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010637 assert(PyUnicode_KIND(u) == rkind);
10638 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010639 ires = i = 0;
10640 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010641 while (n-- > 0) {
10642 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010643 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010644 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010645 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010646 if (j == -1)
10647 break;
10648 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010649 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010650 memcpy(res + rkind * ires,
10651 sbuf + rkind * i,
10652 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010653 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010654 }
10655 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010656 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010657 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010658 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010659 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010660 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010661 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010662 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010663 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010664 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010665 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010666 memcpy(res + rkind * ires,
10667 sbuf + rkind * i,
10668 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010669 }
10670 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010671 /* interleave */
10672 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010673 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010674 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010675 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010676 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010677 if (--n <= 0)
10678 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010679 memcpy(res + rkind * ires,
10680 sbuf + rkind * i,
10681 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010682 ires++;
10683 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010684 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010685 memcpy(res + rkind * ires,
10686 sbuf + rkind * i,
10687 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010688 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010689 }
10690
10691 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010692 unicode_adjust_maxchar(&u);
10693 if (u == NULL)
10694 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010695 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010696
10697 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010698 if (srelease)
10699 PyMem_FREE(sbuf);
10700 if (release1)
10701 PyMem_FREE(buf1);
10702 if (release2)
10703 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010704 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010705 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010706
Benjamin Peterson29060642009-01-31 22:14:21 +000010707 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010708 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010709 if (srelease)
10710 PyMem_FREE(sbuf);
10711 if (release1)
10712 PyMem_FREE(buf1);
10713 if (release2)
10714 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010715 return unicode_result_unchanged(self);
10716
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010717 error:
10718 if (srelease && sbuf)
10719 PyMem_FREE(sbuf);
10720 if (release1 && buf1)
10721 PyMem_FREE(buf1);
10722 if (release2 && buf2)
10723 PyMem_FREE(buf2);
10724 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010725}
10726
10727/* --- Unicode Object Methods --------------------------------------------- */
10728
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010729PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010730 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010731\n\
10732Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010733characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010734
10735static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010736unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010737{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010738 if (PyUnicode_READY(self) == -1)
10739 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010740 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010741}
10742
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010743PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010744 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010745\n\
10746Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010747have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010748
10749static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010750unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010751{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010752 if (PyUnicode_READY(self) == -1)
10753 return NULL;
10754 if (PyUnicode_GET_LENGTH(self) == 0)
10755 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010756 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010757}
10758
Benjamin Petersond5890c82012-01-14 13:23:30 -050010759PyDoc_STRVAR(casefold__doc__,
10760 "S.casefold() -> str\n\
10761\n\
10762Return a version of S suitable for caseless comparisons.");
10763
10764static PyObject *
10765unicode_casefold(PyObject *self)
10766{
10767 if (PyUnicode_READY(self) == -1)
10768 return NULL;
10769 if (PyUnicode_IS_ASCII(self))
10770 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010771 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010772}
10773
10774
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010775/* Argument converter. Accepts a single Unicode character. */
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010776
10777static int
10778convert_uc(PyObject *obj, void *addr)
10779{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010780 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010781
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010782 if (!PyUnicode_Check(obj)) {
10783 PyErr_Format(PyExc_TypeError,
10784 "The fill character must be a unicode character, "
10785 "not %.100s", Py_TYPE(obj)->tp_name);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010786 return 0;
10787 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010788 if (PyUnicode_READY(obj) < 0)
10789 return 0;
10790 if (PyUnicode_GET_LENGTH(obj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010791 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010792 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010793 return 0;
10794 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010795 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010796 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010797}
10798
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010799PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010800 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010801\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010802Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010803done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010804
10805static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010806unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010807{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010808 Py_ssize_t marg, left;
10809 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010810 Py_UCS4 fillchar = ' ';
10811
Victor Stinnere9a29352011-10-01 02:14:59 +020010812 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010813 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010814
Benjamin Petersonbac79492012-01-14 13:34:47 -050010815 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010816 return NULL;
10817
Victor Stinnerc4b49542011-12-11 22:44:26 +010010818 if (PyUnicode_GET_LENGTH(self) >= width)
10819 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010820
Victor Stinnerc4b49542011-12-11 22:44:26 +010010821 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010822 left = marg / 2 + (marg & width & 1);
10823
Victor Stinner9310abb2011-10-05 00:59:23 +020010824 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010825}
10826
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010827/* This function assumes that str1 and str2 are readied by the caller. */
10828
Marc-André Lemburge5034372000-08-08 08:04:29 +000010829static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010830unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010831{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010832#define COMPARE(TYPE1, TYPE2) \
10833 do { \
10834 TYPE1* p1 = (TYPE1 *)data1; \
10835 TYPE2* p2 = (TYPE2 *)data2; \
10836 TYPE1* end = p1 + len; \
10837 Py_UCS4 c1, c2; \
10838 for (; p1 != end; p1++, p2++) { \
10839 c1 = *p1; \
10840 c2 = *p2; \
10841 if (c1 != c2) \
10842 return (c1 < c2) ? -1 : 1; \
10843 } \
10844 } \
10845 while (0)
10846
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010847 int kind1, kind2;
10848 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010849 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010850
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010851 kind1 = PyUnicode_KIND(str1);
10852 kind2 = PyUnicode_KIND(str2);
10853 data1 = PyUnicode_DATA(str1);
10854 data2 = PyUnicode_DATA(str2);
10855 len1 = PyUnicode_GET_LENGTH(str1);
10856 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010857 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010858
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010859 switch(kind1) {
10860 case PyUnicode_1BYTE_KIND:
10861 {
10862 switch(kind2) {
10863 case PyUnicode_1BYTE_KIND:
10864 {
10865 int cmp = memcmp(data1, data2, len);
10866 /* normalize result of memcmp() into the range [-1; 1] */
10867 if (cmp < 0)
10868 return -1;
10869 if (cmp > 0)
10870 return 1;
10871 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020010872 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010873 case PyUnicode_2BYTE_KIND:
10874 COMPARE(Py_UCS1, Py_UCS2);
10875 break;
10876 case PyUnicode_4BYTE_KIND:
10877 COMPARE(Py_UCS1, Py_UCS4);
10878 break;
10879 default:
10880 assert(0);
10881 }
10882 break;
10883 }
10884 case PyUnicode_2BYTE_KIND:
10885 {
10886 switch(kind2) {
10887 case PyUnicode_1BYTE_KIND:
10888 COMPARE(Py_UCS2, Py_UCS1);
10889 break;
10890 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010891 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010892 COMPARE(Py_UCS2, Py_UCS2);
10893 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010894 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010895 case PyUnicode_4BYTE_KIND:
10896 COMPARE(Py_UCS2, Py_UCS4);
10897 break;
10898 default:
10899 assert(0);
10900 }
10901 break;
10902 }
10903 case PyUnicode_4BYTE_KIND:
10904 {
10905 switch(kind2) {
10906 case PyUnicode_1BYTE_KIND:
10907 COMPARE(Py_UCS4, Py_UCS1);
10908 break;
10909 case PyUnicode_2BYTE_KIND:
10910 COMPARE(Py_UCS4, Py_UCS2);
10911 break;
10912 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010913 {
10914#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10915 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10916 /* normalize result of wmemcmp() into the range [-1; 1] */
10917 if (cmp < 0)
10918 return -1;
10919 if (cmp > 0)
10920 return 1;
10921#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010922 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020010923#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010924 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010925 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010926 default:
10927 assert(0);
10928 }
10929 break;
10930 }
10931 default:
10932 assert(0);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010933 }
10934
Victor Stinner770e19e2012-10-04 22:59:45 +020010935 if (len1 == len2)
10936 return 0;
10937 if (len1 < len2)
10938 return -1;
10939 else
10940 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010941
10942#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000010943}
10944
Benjamin Peterson621b4302016-09-09 13:54:34 -070010945static int
Victor Stinnere5567ad2012-10-23 02:48:49 +020010946unicode_compare_eq(PyObject *str1, PyObject *str2)
10947{
10948 int kind;
10949 void *data1, *data2;
10950 Py_ssize_t len;
10951 int cmp;
10952
Victor Stinnere5567ad2012-10-23 02:48:49 +020010953 len = PyUnicode_GET_LENGTH(str1);
10954 if (PyUnicode_GET_LENGTH(str2) != len)
10955 return 0;
10956 kind = PyUnicode_KIND(str1);
10957 if (PyUnicode_KIND(str2) != kind)
10958 return 0;
10959 data1 = PyUnicode_DATA(str1);
10960 data2 = PyUnicode_DATA(str2);
10961
10962 cmp = memcmp(data1, data2, len * kind);
10963 return (cmp == 0);
10964}
10965
10966
Alexander Belopolsky40018472011-02-26 01:02:56 +000010967int
10968PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010969{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010970 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10971 if (PyUnicode_READY(left) == -1 ||
10972 PyUnicode_READY(right) == -1)
10973 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010010974
10975 /* a string is equal to itself */
10976 if (left == right)
10977 return 0;
10978
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010979 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010980 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010981 PyErr_Format(PyExc_TypeError,
10982 "Can't compare %.100s and %.100s",
10983 left->ob_type->tp_name,
10984 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010985 return -1;
10986}
10987
Martin v. Löwis5b222132007-06-10 09:51:05 +000010988int
Victor Stinnerad14ccd2013-11-07 00:46:04 +010010989_PyUnicode_CompareWithId(PyObject *left, _Py_Identifier *right)
10990{
10991 PyObject *right_str = _PyUnicode_FromId(right); /* borrowed */
10992 if (right_str == NULL)
10993 return -1;
10994 return PyUnicode_Compare(left, right_str);
10995}
10996
10997int
Martin v. Löwis5b222132007-06-10 09:51:05 +000010998PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10999{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011000 Py_ssize_t i;
11001 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011002 Py_UCS4 chr;
11003
Victor Stinner910337b2011-10-03 03:20:16 +020011004 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011005 if (PyUnicode_READY(uni) == -1)
11006 return -1;
11007 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011008 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010011009 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010011010 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011011 size_t len, len2 = strlen(str);
11012 int cmp;
11013
11014 len = Py_MIN(len1, len2);
11015 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010011016 if (cmp != 0) {
11017 if (cmp < 0)
11018 return -1;
11019 else
11020 return 1;
11021 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010011022 if (len1 > len2)
11023 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011024 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010011025 return -1; /* str is longer */
11026 return 0;
11027 }
11028 else {
11029 void *data = PyUnicode_DATA(uni);
11030 /* Compare Unicode string and source character set string */
11031 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020011032 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010011033 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11034 /* This check keeps Python strings that end in '\0' from comparing equal
11035 to C strings identical up to that point. */
11036 if (PyUnicode_GET_LENGTH(uni) != i || chr)
11037 return 1; /* uni is longer */
11038 if (str[i])
11039 return -1; /* str is longer */
11040 return 0;
11041 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000011042}
11043
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011044
Benjamin Peterson29060642009-01-31 22:14:21 +000011045#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000011046 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011047
Alexander Belopolsky40018472011-02-26 01:02:56 +000011048PyObject *
11049PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011050{
11051 int result;
Victor Stinnere5567ad2012-10-23 02:48:49 +020011052 PyObject *v;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011053
Victor Stinnere5567ad2012-10-23 02:48:49 +020011054 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11055 Py_RETURN_NOTIMPLEMENTED;
11056
11057 if (PyUnicode_READY(left) == -1 ||
11058 PyUnicode_READY(right) == -1)
11059 return NULL;
11060
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011061 if (left == right) {
11062 switch (op) {
11063 case Py_EQ:
11064 case Py_LE:
11065 case Py_GE:
11066 /* a string is equal to itself */
11067 v = Py_True;
11068 break;
11069 case Py_NE:
11070 case Py_LT:
11071 case Py_GT:
11072 v = Py_False;
11073 break;
11074 default:
11075 PyErr_BadArgument();
11076 return NULL;
11077 }
11078 }
11079 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020011080 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010011081 result ^= (op == Py_NE);
11082 v = TEST_COND(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020011083 }
11084 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020011085 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011086
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011087 /* Convert the return value to a Boolean */
11088 switch (op) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011089 case Py_LE:
11090 v = TEST_COND(result <= 0);
11091 break;
11092 case Py_GE:
11093 v = TEST_COND(result >= 0);
11094 break;
11095 case Py_LT:
11096 v = TEST_COND(result == -1);
11097 break;
11098 case Py_GT:
11099 v = TEST_COND(result == 1);
11100 break;
11101 default:
11102 PyErr_BadArgument();
11103 return NULL;
11104 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011105 }
Victor Stinnere5567ad2012-10-23 02:48:49 +020011106 Py_INCREF(v);
11107 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011108}
11109
Alexander Belopolsky40018472011-02-26 01:02:56 +000011110int
Raymond Hettingerac2ef652015-07-04 16:04:44 -070011111_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11112{
11113 return unicode_eq(aa, bb);
11114}
11115
11116int
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011117PyUnicode_Contains(PyObject *str, PyObject *substr)
Guido van Rossum403d68b2000-03-13 15:55:09 +000011118{
Victor Stinner77282cb2013-04-14 19:22:47 +020011119 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011120 void *buf1, *buf2;
11121 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011122 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011123
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011124 if (!PyUnicode_Check(substr)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011125 PyErr_Format(PyExc_TypeError,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011126 "'in <string>' requires string as left operand, not %.100s",
11127 Py_TYPE(substr)->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011128 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011129 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011130 if (PyUnicode_READY(substr) == -1)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011131 return -1;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011132 if (ensure_unicode(str) < 0)
11133 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011134
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011135 kind1 = PyUnicode_KIND(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011136 kind2 = PyUnicode_KIND(substr);
11137 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011138 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011139 len1 = PyUnicode_GET_LENGTH(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011140 len2 = PyUnicode_GET_LENGTH(substr);
11141 if (len1 < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011142 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011143 buf1 = PyUnicode_DATA(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011144 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011145 if (len2 == 1) {
11146 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11147 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011148 return result;
11149 }
11150 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011151 buf2 = _PyUnicode_AsKind(substr, kind1);
11152 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011153 return -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011154 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011155
Victor Stinner77282cb2013-04-14 19:22:47 +020011156 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011157 case PyUnicode_1BYTE_KIND:
11158 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11159 break;
11160 case PyUnicode_2BYTE_KIND:
11161 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11162 break;
11163 case PyUnicode_4BYTE_KIND:
11164 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11165 break;
11166 default:
11167 result = -1;
11168 assert(0);
11169 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011170
Victor Stinner77282cb2013-04-14 19:22:47 +020011171 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011172 PyMem_Free(buf2);
11173
Guido van Rossum403d68b2000-03-13 15:55:09 +000011174 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011175}
11176
Guido van Rossumd57fd912000-03-10 22:53:23 +000011177/* Concat to string or Unicode object giving a new Unicode object. */
11178
Alexander Belopolsky40018472011-02-26 01:02:56 +000011179PyObject *
11180PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011181{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011182 PyObject *result;
Victor Stinner127226b2011-10-13 01:12:34 +020011183 Py_UCS4 maxchar, maxchar2;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011184 Py_ssize_t left_len, right_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011185
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011186 if (ensure_unicode(left) < 0 || ensure_unicode(right) < 0)
11187 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011188
11189 /* Shortcuts */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011190 if (left == unicode_empty)
11191 return PyUnicode_FromObject(right);
11192 if (right == unicode_empty)
11193 return PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011194
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011195 left_len = PyUnicode_GET_LENGTH(left);
11196 right_len = PyUnicode_GET_LENGTH(right);
11197 if (left_len > PY_SSIZE_T_MAX - right_len) {
Victor Stinner488fa492011-12-12 00:01:39 +010011198 PyErr_SetString(PyExc_OverflowError,
11199 "strings are too large to concat");
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011200 return NULL;
Victor Stinner488fa492011-12-12 00:01:39 +010011201 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011202 new_len = left_len + right_len;
Victor Stinner488fa492011-12-12 00:01:39 +010011203
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011204 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11205 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011206 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011207
Guido van Rossumd57fd912000-03-10 22:53:23 +000011208 /* Concat the two Unicode strings */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011209 result = PyUnicode_New(new_len, maxchar);
11210 if (result == NULL)
11211 return NULL;
11212 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11213 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11214 assert(_PyUnicode_CheckConsistency(result, 1));
11215 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011216}
11217
Walter Dörwald1ab83302007-05-18 17:15:44 +000011218void
Victor Stinner23e56682011-10-03 03:54:37 +020011219PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011220{
Victor Stinner23e56682011-10-03 03:54:37 +020011221 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011222 Py_UCS4 maxchar, maxchar2;
11223 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011224
11225 if (p_left == NULL) {
11226 if (!PyErr_Occurred())
11227 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011228 return;
11229 }
Victor Stinner23e56682011-10-03 03:54:37 +020011230 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011231 if (right == NULL || left == NULL
11232 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011233 if (!PyErr_Occurred())
11234 PyErr_BadInternalCall();
11235 goto error;
11236 }
11237
Benjamin Petersonbac79492012-01-14 13:34:47 -050011238 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011239 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011240 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011241 goto error;
11242
Victor Stinner488fa492011-12-12 00:01:39 +010011243 /* Shortcuts */
11244 if (left == unicode_empty) {
11245 Py_DECREF(left);
11246 Py_INCREF(right);
11247 *p_left = right;
11248 return;
11249 }
11250 if (right == unicode_empty)
11251 return;
11252
11253 left_len = PyUnicode_GET_LENGTH(left);
11254 right_len = PyUnicode_GET_LENGTH(right);
11255 if (left_len > PY_SSIZE_T_MAX - right_len) {
11256 PyErr_SetString(PyExc_OverflowError,
11257 "strings are too large to concat");
11258 goto error;
11259 }
11260 new_len = left_len + right_len;
11261
11262 if (unicode_modifiable(left)
11263 && PyUnicode_CheckExact(right)
11264 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011265 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11266 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011267 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011268 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011269 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11270 {
11271 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011272 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011273 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011274
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011275 /* copy 'right' into the newly allocated area of 'left' */
11276 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011277 }
Victor Stinner488fa492011-12-12 00:01:39 +010011278 else {
11279 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11280 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011281 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011282
Victor Stinner488fa492011-12-12 00:01:39 +010011283 /* Concat the two Unicode strings */
11284 res = PyUnicode_New(new_len, maxchar);
11285 if (res == NULL)
11286 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011287 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11288 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011289 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011290 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011291 }
11292 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011293 return;
11294
11295error:
Victor Stinner488fa492011-12-12 00:01:39 +010011296 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011297}
11298
11299void
11300PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11301{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011302 PyUnicode_Append(pleft, right);
11303 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011304}
11305
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011306/*
11307Wraps stringlib_parse_args_finds() and additionally ensures that the
11308first argument is a unicode object.
11309*/
11310
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070011311static inline int
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011312parse_args_finds_unicode(const char * function_name, PyObject *args,
11313 PyObject **substring,
11314 Py_ssize_t *start, Py_ssize_t *end)
11315{
11316 if(stringlib_parse_args_finds(function_name, args, substring,
11317 start, end)) {
11318 if (ensure_unicode(*substring) < 0)
11319 return 0;
11320 return 1;
11321 }
11322 return 0;
11323}
11324
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011325PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011326 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011327\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011328Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011329string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011330interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011331
11332static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011333unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011334{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011335 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011336 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011337 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011338 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011339 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011340 void *buf1, *buf2;
11341 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011342
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011343 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011344 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011345
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011346 kind1 = PyUnicode_KIND(self);
11347 kind2 = PyUnicode_KIND(substring);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011348 if (kind1 < kind2)
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011349 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011350
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011351 len1 = PyUnicode_GET_LENGTH(self);
11352 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011353 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011354 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011355 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011356
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011357 buf1 = PyUnicode_DATA(self);
11358 buf2 = PyUnicode_DATA(substring);
11359 if (kind2 != kind1) {
11360 buf2 = _PyUnicode_AsKind(substring, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011361 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011362 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011363 }
11364 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011365 case PyUnicode_1BYTE_KIND:
11366 iresult = ucs1lib_count(
11367 ((Py_UCS1*)buf1) + start, end - start,
11368 buf2, len2, PY_SSIZE_T_MAX
11369 );
11370 break;
11371 case PyUnicode_2BYTE_KIND:
11372 iresult = ucs2lib_count(
11373 ((Py_UCS2*)buf1) + start, end - start,
11374 buf2, len2, PY_SSIZE_T_MAX
11375 );
11376 break;
11377 case PyUnicode_4BYTE_KIND:
11378 iresult = ucs4lib_count(
11379 ((Py_UCS4*)buf1) + start, end - start,
11380 buf2, len2, PY_SSIZE_T_MAX
11381 );
11382 break;
11383 default:
11384 assert(0); iresult = 0;
11385 }
11386
11387 result = PyLong_FromSsize_t(iresult);
11388
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011389 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011390 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011391
Guido van Rossumd57fd912000-03-10 22:53:23 +000011392 return result;
11393}
11394
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011395PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000011396 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011397\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000011398Encode S using the codec registered for encoding. Default encoding\n\
11399is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000011400handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000011401a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
11402'xmlcharrefreplace' as well as any other name registered with\n\
11403codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011404
11405static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011406unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011407{
Benjamin Peterson308d6372009-09-18 21:42:35 +000011408 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011409 char *encoding = NULL;
11410 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000011411
Benjamin Peterson308d6372009-09-18 21:42:35 +000011412 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
11413 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011414 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011415 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011416}
11417
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011418PyDoc_STRVAR(expandtabs__doc__,
Ezio Melotti745d54d2013-11-16 19:10:57 +020011419 "S.expandtabs(tabsize=8) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011420\n\
11421Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011422If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011423
11424static PyObject*
Ezio Melotti745d54d2013-11-16 19:10:57 +020011425unicode_expandtabs(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011426{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011427 Py_ssize_t i, j, line_pos, src_len, incr;
11428 Py_UCS4 ch;
11429 PyObject *u;
11430 void *src_data, *dest_data;
Ezio Melotti745d54d2013-11-16 19:10:57 +020011431 static char *kwlist[] = {"tabsize", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011432 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011433 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011434 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011435
Ezio Melotti745d54d2013-11-16 19:10:57 +020011436 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:expandtabs",
11437 kwlist, &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000011438 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011439
Antoine Pitrou22425222011-10-04 19:10:51 +020011440 if (PyUnicode_READY(self) == -1)
11441 return NULL;
11442
Thomas Wouters7e474022000-07-16 12:04:32 +000011443 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011444 src_len = PyUnicode_GET_LENGTH(self);
11445 i = j = line_pos = 0;
11446 kind = PyUnicode_KIND(self);
11447 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011448 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011449 for (; i < src_len; i++) {
11450 ch = PyUnicode_READ(kind, src_data, i);
11451 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011452 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011453 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011454 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011455 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011456 goto overflow;
11457 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011458 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011459 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011460 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011461 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011462 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011463 goto overflow;
11464 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011465 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011466 if (ch == '\n' || ch == '\r')
11467 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011468 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011469 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011470 if (!found)
11471 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011472
Guido van Rossumd57fd912000-03-10 22:53:23 +000011473 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011474 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011475 if (!u)
11476 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011477 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011478
Antoine Pitroue71d5742011-10-04 15:55:09 +020011479 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011480
Antoine Pitroue71d5742011-10-04 15:55:09 +020011481 for (; i < src_len; i++) {
11482 ch = PyUnicode_READ(kind, src_data, i);
11483 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011484 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011485 incr = tabsize - (line_pos % tabsize);
11486 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010011487 FILL(kind, dest_data, ' ', j, incr);
11488 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011489 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011490 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011491 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011492 line_pos++;
11493 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011494 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011495 if (ch == '\n' || ch == '\r')
11496 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011497 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011498 }
11499 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011500 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011501
Antoine Pitroue71d5742011-10-04 15:55:09 +020011502 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011503 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11504 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011505}
11506
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011507PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011508 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011509\n\
11510Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011511such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011512arguments start and end are interpreted as in slice notation.\n\
11513\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011514Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011515
11516static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011517unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011518{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011519 /* initialize variables to prevent gcc warning */
11520 PyObject *substring = NULL;
11521 Py_ssize_t start = 0;
11522 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011523 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011524
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011525 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011526 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011527
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011528 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011529 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011530
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011531 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011532
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011533 if (result == -2)
11534 return NULL;
11535
Christian Heimes217cfd12007-12-02 14:31:20 +000011536 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011537}
11538
11539static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011540unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011541{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011542 void *data;
11543 enum PyUnicode_Kind kind;
11544 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011545
11546 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
11547 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011548 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011549 }
11550 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11551 PyErr_SetString(PyExc_IndexError, "string index out of range");
11552 return NULL;
11553 }
11554 kind = PyUnicode_KIND(self);
11555 data = PyUnicode_DATA(self);
11556 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011557 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011558}
11559
Guido van Rossumc2504932007-09-18 19:42:40 +000011560/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011561 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011562static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011563unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011564{
Guido van Rossumc2504932007-09-18 19:42:40 +000011565 Py_ssize_t len;
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011566 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011567
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011568#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011569 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011570#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011571 if (_PyUnicode_HASH(self) != -1)
11572 return _PyUnicode_HASH(self);
11573 if (PyUnicode_READY(self) == -1)
11574 return -1;
11575 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011576 /*
11577 We make the hash of the empty string be 0, rather than using
11578 (prefix ^ suffix), since this slightly obfuscates the hash secret
11579 */
11580 if (len == 0) {
11581 _PyUnicode_HASH(self) = 0;
11582 return 0;
11583 }
Christian Heimes985ecdc2013-11-20 11:46:18 +010011584 x = _Py_HashBytes(PyUnicode_DATA(self),
11585 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011586 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011587 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011588}
11589
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011590PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011591 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011592\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011593Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011594
11595static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011596unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011597{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011598 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011599 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011600 PyObject *substring = NULL;
11601 Py_ssize_t start = 0;
11602 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011603
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011604 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011605 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011606
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011607 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011608 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011609
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011610 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011611
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011612 if (result == -2)
11613 return NULL;
11614
Guido van Rossumd57fd912000-03-10 22:53:23 +000011615 if (result < 0) {
11616 PyErr_SetString(PyExc_ValueError, "substring not found");
11617 return NULL;
11618 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011619
Christian Heimes217cfd12007-12-02 14:31:20 +000011620 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011621}
11622
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011623PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011624 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011625\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011626Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011627at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011628
11629static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011630unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011631{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011632 Py_ssize_t i, length;
11633 int kind;
11634 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011635 int cased;
11636
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011637 if (PyUnicode_READY(self) == -1)
11638 return NULL;
11639 length = PyUnicode_GET_LENGTH(self);
11640 kind = PyUnicode_KIND(self);
11641 data = PyUnicode_DATA(self);
11642
Guido van Rossumd57fd912000-03-10 22:53:23 +000011643 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011644 if (length == 1)
11645 return PyBool_FromLong(
11646 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011647
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011648 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011649 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011650 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011651
Guido van Rossumd57fd912000-03-10 22:53:23 +000011652 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011653 for (i = 0; i < length; i++) {
11654 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011655
Benjamin Peterson29060642009-01-31 22:14:21 +000011656 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11657 return PyBool_FromLong(0);
11658 else if (!cased && Py_UNICODE_ISLOWER(ch))
11659 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011660 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011661 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011662}
11663
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011664PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011665 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011666\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011667Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011668at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011669
11670static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011671unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011672{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011673 Py_ssize_t i, length;
11674 int kind;
11675 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011676 int cased;
11677
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011678 if (PyUnicode_READY(self) == -1)
11679 return NULL;
11680 length = PyUnicode_GET_LENGTH(self);
11681 kind = PyUnicode_KIND(self);
11682 data = PyUnicode_DATA(self);
11683
Guido van Rossumd57fd912000-03-10 22:53:23 +000011684 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011685 if (length == 1)
11686 return PyBool_FromLong(
11687 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011688
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011689 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011690 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011691 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011692
Guido van Rossumd57fd912000-03-10 22:53:23 +000011693 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011694 for (i = 0; i < length; i++) {
11695 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011696
Benjamin Peterson29060642009-01-31 22:14:21 +000011697 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11698 return PyBool_FromLong(0);
11699 else if (!cased && Py_UNICODE_ISUPPER(ch))
11700 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011701 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011702 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011703}
11704
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011705PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011706 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011707\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011708Return True if S is a titlecased string and there is at least one\n\
11709character in S, i.e. upper- and titlecase characters may only\n\
11710follow uncased characters and lowercase characters only cased ones.\n\
11711Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011712
11713static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011714unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011715{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011716 Py_ssize_t i, length;
11717 int kind;
11718 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011719 int cased, previous_is_cased;
11720
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011721 if (PyUnicode_READY(self) == -1)
11722 return NULL;
11723 length = PyUnicode_GET_LENGTH(self);
11724 kind = PyUnicode_KIND(self);
11725 data = PyUnicode_DATA(self);
11726
Guido van Rossumd57fd912000-03-10 22:53:23 +000011727 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011728 if (length == 1) {
11729 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11730 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11731 (Py_UNICODE_ISUPPER(ch) != 0));
11732 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011733
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011734 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011735 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011736 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011737
Guido van Rossumd57fd912000-03-10 22:53:23 +000011738 cased = 0;
11739 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011740 for (i = 0; i < length; i++) {
11741 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011742
Benjamin Peterson29060642009-01-31 22:14:21 +000011743 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11744 if (previous_is_cased)
11745 return PyBool_FromLong(0);
11746 previous_is_cased = 1;
11747 cased = 1;
11748 }
11749 else if (Py_UNICODE_ISLOWER(ch)) {
11750 if (!previous_is_cased)
11751 return PyBool_FromLong(0);
11752 previous_is_cased = 1;
11753 cased = 1;
11754 }
11755 else
11756 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011757 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011758 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011759}
11760
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011761PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011762 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011763\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011764Return True if all characters in S are whitespace\n\
11765and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011766
11767static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011768unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011769{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011770 Py_ssize_t i, length;
11771 int kind;
11772 void *data;
11773
11774 if (PyUnicode_READY(self) == -1)
11775 return NULL;
11776 length = PyUnicode_GET_LENGTH(self);
11777 kind = PyUnicode_KIND(self);
11778 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011779
Guido van Rossumd57fd912000-03-10 22:53:23 +000011780 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011781 if (length == 1)
11782 return PyBool_FromLong(
11783 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011784
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011785 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011786 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011787 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011788
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011789 for (i = 0; i < length; i++) {
11790 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011791 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011792 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011793 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011794 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011795}
11796
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011797PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011798 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011799\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011800Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011801and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011802
11803static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011804unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011805{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011806 Py_ssize_t i, length;
11807 int kind;
11808 void *data;
11809
11810 if (PyUnicode_READY(self) == -1)
11811 return NULL;
11812 length = PyUnicode_GET_LENGTH(self);
11813 kind = PyUnicode_KIND(self);
11814 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011815
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011816 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011817 if (length == 1)
11818 return PyBool_FromLong(
11819 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011820
11821 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011822 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011823 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011824
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011825 for (i = 0; i < length; i++) {
11826 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011827 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011828 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011829 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011830}
11831
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011832PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011833 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011834\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011835Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011836and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011837
11838static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011839unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011840{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011841 int kind;
11842 void *data;
11843 Py_ssize_t len, i;
11844
11845 if (PyUnicode_READY(self) == -1)
11846 return NULL;
11847
11848 kind = PyUnicode_KIND(self);
11849 data = PyUnicode_DATA(self);
11850 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011851
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011852 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011853 if (len == 1) {
11854 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11855 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11856 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011857
11858 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011859 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011860 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011861
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011862 for (i = 0; i < len; i++) {
11863 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011864 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011865 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011866 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011867 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011868}
11869
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011870PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011871 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011872\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011873Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011874False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011875
11876static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011877unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011878{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011879 Py_ssize_t i, length;
11880 int kind;
11881 void *data;
11882
11883 if (PyUnicode_READY(self) == -1)
11884 return NULL;
11885 length = PyUnicode_GET_LENGTH(self);
11886 kind = PyUnicode_KIND(self);
11887 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011888
Guido van Rossumd57fd912000-03-10 22:53:23 +000011889 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011890 if (length == 1)
11891 return PyBool_FromLong(
11892 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011893
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011894 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011895 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011896 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011897
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011898 for (i = 0; i < length; i++) {
11899 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011900 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011901 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011902 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011903}
11904
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011905PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011906 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011907\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011908Return True if all characters in S are digits\n\
11909and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011910
11911static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011912unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011913{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011914 Py_ssize_t i, length;
11915 int kind;
11916 void *data;
11917
11918 if (PyUnicode_READY(self) == -1)
11919 return NULL;
11920 length = PyUnicode_GET_LENGTH(self);
11921 kind = PyUnicode_KIND(self);
11922 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011923
Guido van Rossumd57fd912000-03-10 22:53:23 +000011924 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011925 if (length == 1) {
11926 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11927 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11928 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011929
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011930 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011931 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011932 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011933
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011934 for (i = 0; i < length; i++) {
11935 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011936 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011937 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011938 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011939}
11940
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011941PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011942 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011943\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011944Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011945False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011946
11947static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011948unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011949{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011950 Py_ssize_t i, length;
11951 int kind;
11952 void *data;
11953
11954 if (PyUnicode_READY(self) == -1)
11955 return NULL;
11956 length = PyUnicode_GET_LENGTH(self);
11957 kind = PyUnicode_KIND(self);
11958 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011959
Guido van Rossumd57fd912000-03-10 22:53:23 +000011960 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011961 if (length == 1)
11962 return PyBool_FromLong(
11963 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011964
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011965 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011966 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011967 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011968
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011969 for (i = 0; i < length; i++) {
11970 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011971 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011972 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011973 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011974}
11975
Martin v. Löwis47383402007-08-15 07:32:56 +000011976int
11977PyUnicode_IsIdentifier(PyObject *self)
11978{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011979 int kind;
11980 void *data;
11981 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011982 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011983
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011984 if (PyUnicode_READY(self) == -1) {
11985 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011986 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011987 }
11988
11989 /* Special case for empty strings */
11990 if (PyUnicode_GET_LENGTH(self) == 0)
11991 return 0;
11992 kind = PyUnicode_KIND(self);
11993 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011994
11995 /* PEP 3131 says that the first character must be in
11996 XID_Start and subsequent characters in XID_Continue,
11997 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011998 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011999 letters, digits, underscore). However, given the current
12000 definition of XID_Start and XID_Continue, it is sufficient
12001 to check just for these, except that _ must be allowed
12002 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012003 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050012004 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000012005 return 0;
12006
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040012007 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012008 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000012009 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000012010 return 1;
12011}
12012
12013PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012014 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000012015\n\
12016Return True if S is a valid identifier according\n\
Raymond Hettinger378170d2013-03-23 08:21:12 -070012017to the language definition.\n\
12018\n\
12019Use keyword.iskeyword() to test for reserved identifiers\n\
12020such as \"def\" and \"class\".\n");
Martin v. Löwis47383402007-08-15 07:32:56 +000012021
12022static PyObject*
12023unicode_isidentifier(PyObject *self)
12024{
12025 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12026}
12027
Georg Brandl559e5d72008-06-11 18:37:52 +000012028PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012029 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000012030\n\
12031Return True if all characters in S are considered\n\
12032printable in repr() or S is empty, False otherwise.");
12033
12034static PyObject*
12035unicode_isprintable(PyObject *self)
12036{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012037 Py_ssize_t i, length;
12038 int kind;
12039 void *data;
12040
12041 if (PyUnicode_READY(self) == -1)
12042 return NULL;
12043 length = PyUnicode_GET_LENGTH(self);
12044 kind = PyUnicode_KIND(self);
12045 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000012046
12047 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012048 if (length == 1)
12049 return PyBool_FromLong(
12050 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000012051
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012052 for (i = 0; i < length; i++) {
12053 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012054 Py_RETURN_FALSE;
12055 }
12056 }
12057 Py_RETURN_TRUE;
12058}
12059
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012060PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000012061 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012062\n\
12063Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000012064iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012065
12066static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012067unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012068{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012069 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012070}
12071
Martin v. Löwis18e16552006-02-15 17:27:45 +000012072static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012073unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012074{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012075 if (PyUnicode_READY(self) == -1)
12076 return -1;
12077 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012078}
12079
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012080PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012081 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012082\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012083Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012084done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012085
12086static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012087unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012088{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012089 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012090 Py_UCS4 fillchar = ' ';
12091
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012092 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012093 return NULL;
12094
Benjamin Petersonbac79492012-01-14 13:34:47 -050012095 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012096 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012097
Victor Stinnerc4b49542011-12-11 22:44:26 +010012098 if (PyUnicode_GET_LENGTH(self) >= width)
12099 return unicode_result_unchanged(self);
12100
12101 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012102}
12103
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012104PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012105 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012106\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012107Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012108
12109static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012110unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012111{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012112 if (PyUnicode_READY(self) == -1)
12113 return NULL;
12114 if (PyUnicode_IS_ASCII(self))
12115 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012116 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012117}
12118
/* Strip modes; the values double as indices into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple() format strings, indexed by strip mode. */
static const char * const stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for strip mode i: the format string minus its three
   character "|O:" prefix. */
#define STRIPNAME(i) (&stripformat[i][3])
12127
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012128/* externally visible for str.strip(unicode) */
12129PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012130_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012131{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012132 void *data;
12133 int kind;
12134 Py_ssize_t i, j, len;
12135 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012136 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012137
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012138 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12139 return NULL;
12140
12141 kind = PyUnicode_KIND(self);
12142 data = PyUnicode_DATA(self);
12143 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020012144 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012145 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12146 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020012147 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012148
Benjamin Peterson14339b62009-01-31 16:36:08 +000012149 i = 0;
12150 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012151 while (i < len) {
12152 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12153 if (!BLOOM(sepmask, ch))
12154 break;
12155 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12156 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012157 i++;
12158 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012159 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012160
Benjamin Peterson14339b62009-01-31 16:36:08 +000012161 j = len;
12162 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012163 j--;
12164 while (j >= i) {
12165 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12166 if (!BLOOM(sepmask, ch))
12167 break;
12168 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12169 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012170 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012171 }
12172
Benjamin Peterson29060642009-01-31 22:14:21 +000012173 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012174 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012175
Victor Stinner7931d9a2011-11-04 00:22:48 +010012176 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012177}
12178
12179PyObject*
12180PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12181{
12182 unsigned char *data;
12183 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012184 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012185
Victor Stinnerde636f32011-10-01 03:55:54 +020012186 if (PyUnicode_READY(self) == -1)
12187 return NULL;
12188
Victor Stinner684d5fd2012-05-03 02:32:34 +020012189 length = PyUnicode_GET_LENGTH(self);
12190 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012191
Victor Stinner684d5fd2012-05-03 02:32:34 +020012192 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012193 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012194
Victor Stinnerde636f32011-10-01 03:55:54 +020012195 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012196 PyErr_SetString(PyExc_IndexError, "string index out of range");
12197 return NULL;
12198 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012199 if (start >= length || end < start)
12200 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012201
Victor Stinner684d5fd2012-05-03 02:32:34 +020012202 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012203 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012204 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012205 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012206 }
12207 else {
12208 kind = PyUnicode_KIND(self);
12209 data = PyUnicode_1BYTE_DATA(self);
12210 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012211 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012212 length);
12213 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012214}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012215
12216static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012217do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012218{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012219 Py_ssize_t len, i, j;
12220
12221 if (PyUnicode_READY(self) == -1)
12222 return NULL;
12223
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012224 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012225
Victor Stinnercc7af722013-04-09 22:39:24 +020012226 if (PyUnicode_IS_ASCII(self)) {
12227 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12228
12229 i = 0;
12230 if (striptype != RIGHTSTRIP) {
12231 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012232 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012233 if (!_Py_ascii_whitespace[ch])
12234 break;
12235 i++;
12236 }
12237 }
12238
12239 j = len;
12240 if (striptype != LEFTSTRIP) {
12241 j--;
12242 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012243 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012244 if (!_Py_ascii_whitespace[ch])
12245 break;
12246 j--;
12247 }
12248 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012249 }
12250 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012251 else {
12252 int kind = PyUnicode_KIND(self);
12253 void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012254
Victor Stinnercc7af722013-04-09 22:39:24 +020012255 i = 0;
12256 if (striptype != RIGHTSTRIP) {
12257 while (i < len) {
12258 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12259 if (!Py_UNICODE_ISSPACE(ch))
12260 break;
12261 i++;
12262 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012263 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012264
12265 j = len;
12266 if (striptype != LEFTSTRIP) {
12267 j--;
12268 while (j >= i) {
12269 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12270 if (!Py_UNICODE_ISSPACE(ch))
12271 break;
12272 j--;
12273 }
12274 j++;
12275 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012276 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012277
Victor Stinner7931d9a2011-11-04 00:22:48 +010012278 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012279}
12280
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012281
12282static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012283do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012284{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012285 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012286
Serhiy Storchakac6792272013-10-19 21:03:34 +030012287 if (!PyArg_ParseTuple(args, stripformat[striptype], &sep))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012288 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012289
Benjamin Peterson14339b62009-01-31 16:36:08 +000012290 if (sep != NULL && sep != Py_None) {
12291 if (PyUnicode_Check(sep))
12292 return _PyUnicode_XStrip(self, striptype, sep);
12293 else {
12294 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012295 "%s arg must be None or str",
12296 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012297 return NULL;
12298 }
12299 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012300
Benjamin Peterson14339b62009-01-31 16:36:08 +000012301 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012302}
12303
12304
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012305PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012306 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012307\n\
12308Return a copy of the string S with leading and trailing\n\
12309whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012310If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012311
12312static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012313unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012314{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012315 if (PyTuple_GET_SIZE(args) == 0)
12316 return do_strip(self, BOTHSTRIP); /* Common case */
12317 else
12318 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012319}
12320
12321
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012322PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012323 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012324\n\
12325Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012326If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012327
12328static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012329unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012330{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012331 if (PyTuple_GET_SIZE(args) == 0)
12332 return do_strip(self, LEFTSTRIP); /* Common case */
12333 else
12334 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012335}
12336
12337
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012338PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012339 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012340\n\
12341Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012342If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012343
12344static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012345unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012346{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012347 if (PyTuple_GET_SIZE(args) == 0)
12348 return do_strip(self, RIGHTSTRIP); /* Common case */
12349 else
12350 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012351}
12352
12353
Guido van Rossumd57fd912000-03-10 22:53:23 +000012354static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012355unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012356{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012357 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012358 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012359
Serhiy Storchaka05997252013-01-26 12:14:02 +020012360 if (len < 1)
12361 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012362
Victor Stinnerc4b49542011-12-11 22:44:26 +010012363 /* no repeat, return original string */
12364 if (len == 1)
12365 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012366
Benjamin Petersonbac79492012-01-14 13:34:47 -050012367 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012368 return NULL;
12369
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012370 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012371 PyErr_SetString(PyExc_OverflowError,
12372 "repeated string is too long");
12373 return NULL;
12374 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012375 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012376
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012377 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012378 if (!u)
12379 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012380 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012381
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012382 if (PyUnicode_GET_LENGTH(str) == 1) {
12383 const int kind = PyUnicode_KIND(str);
12384 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012385 if (kind == PyUnicode_1BYTE_KIND) {
12386 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012387 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012388 }
12389 else if (kind == PyUnicode_2BYTE_KIND) {
12390 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012391 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012392 ucs2[n] = fill_char;
12393 } else {
12394 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12395 assert(kind == PyUnicode_4BYTE_KIND);
12396 for (n = 0; n < len; ++n)
12397 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012398 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012399 }
12400 else {
12401 /* number of characters copied this far */
12402 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012403 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012404 char *to = (char *) PyUnicode_DATA(u);
Christian Heimesf051e432016-09-13 20:22:02 +020012405 memcpy(to, PyUnicode_DATA(str),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012406 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012407 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012408 n = (done <= nchars-done) ? done : nchars-done;
Christian Heimesf051e432016-09-13 20:22:02 +020012409 memcpy(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012410 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012411 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012412 }
12413
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012414 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012415 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012416}
12417
Alexander Belopolsky40018472011-02-26 01:02:56 +000012418PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012419PyUnicode_Replace(PyObject *str,
12420 PyObject *substr,
12421 PyObject *replstr,
Alexander Belopolsky40018472011-02-26 01:02:56 +000012422 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012423{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012424 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12425 ensure_unicode(replstr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012426 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012427 return replace(str, substr, replstr, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012428}
12429
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012430PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000012431 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012432\n\
12433Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000012434old replaced by new. If the optional argument count is\n\
12435given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012436
12437static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012438unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012439{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012440 PyObject *str1;
12441 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012442 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012443
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012444 if (!PyArg_ParseTuple(args, "UU|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012445 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060012446 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012447 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012448 return replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012449}
12450
Alexander Belopolsky40018472011-02-26 01:02:56 +000012451static PyObject *
12452unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012453{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012454 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012455 Py_ssize_t isize;
12456 Py_ssize_t osize, squote, dquote, i, o;
12457 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012458 int ikind, okind, unchanged;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012459 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012460
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012461 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012462 return NULL;
12463
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012464 isize = PyUnicode_GET_LENGTH(unicode);
12465 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012466
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012467 /* Compute length of output, quote characters, and
12468 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012469 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012470 max = 127;
12471 squote = dquote = 0;
12472 ikind = PyUnicode_KIND(unicode);
12473 for (i = 0; i < isize; i++) {
12474 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040012475 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012476 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040012477 case '\'': squote++; break;
12478 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012479 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040012480 incr = 2;
12481 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012482 default:
12483 /* Fast-path ASCII */
12484 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012485 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012486 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012487 ;
12488 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012489 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012490 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012491 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012492 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012493 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012494 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040012495 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012496 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040012497 if (osize > PY_SSIZE_T_MAX - incr) {
12498 PyErr_SetString(PyExc_OverflowError,
12499 "string is too long to generate repr");
12500 return NULL;
12501 }
12502 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012503 }
12504
12505 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012506 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012507 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012508 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012509 if (dquote)
12510 /* Both squote and dquote present. Use squote,
12511 and escape them */
12512 osize += squote;
12513 else
12514 quote = '"';
12515 }
Victor Stinner55c08782013-04-14 18:45:39 +020012516 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012517
12518 repr = PyUnicode_New(osize, max);
12519 if (repr == NULL)
12520 return NULL;
12521 okind = PyUnicode_KIND(repr);
12522 odata = PyUnicode_DATA(repr);
12523
12524 PyUnicode_WRITE(okind, odata, 0, quote);
12525 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012526 if (unchanged) {
12527 _PyUnicode_FastCopyCharacters(repr, 1,
12528 unicode, 0,
12529 isize);
12530 }
12531 else {
12532 for (i = 0, o = 1; i < isize; i++) {
12533 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012534
Victor Stinner55c08782013-04-14 18:45:39 +020012535 /* Escape quotes and backslashes */
12536 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012537 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012538 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012539 continue;
12540 }
12541
12542 /* Map special whitespace to '\t', \n', '\r' */
12543 if (ch == '\t') {
12544 PyUnicode_WRITE(okind, odata, o++, '\\');
12545 PyUnicode_WRITE(okind, odata, o++, 't');
12546 }
12547 else if (ch == '\n') {
12548 PyUnicode_WRITE(okind, odata, o++, '\\');
12549 PyUnicode_WRITE(okind, odata, o++, 'n');
12550 }
12551 else if (ch == '\r') {
12552 PyUnicode_WRITE(okind, odata, o++, '\\');
12553 PyUnicode_WRITE(okind, odata, o++, 'r');
12554 }
12555
12556 /* Map non-printable US ASCII to '\xhh' */
12557 else if (ch < ' ' || ch == 0x7F) {
12558 PyUnicode_WRITE(okind, odata, o++, '\\');
12559 PyUnicode_WRITE(okind, odata, o++, 'x');
12560 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12561 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12562 }
12563
12564 /* Copy ASCII characters as-is */
12565 else if (ch < 0x7F) {
12566 PyUnicode_WRITE(okind, odata, o++, ch);
12567 }
12568
12569 /* Non-ASCII characters */
12570 else {
12571 /* Map Unicode whitespace and control characters
12572 (categories Z* and C* except ASCII space)
12573 */
12574 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12575 PyUnicode_WRITE(okind, odata, o++, '\\');
12576 /* Map 8-bit characters to '\xhh' */
12577 if (ch <= 0xff) {
12578 PyUnicode_WRITE(okind, odata, o++, 'x');
12579 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12580 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12581 }
12582 /* Map 16-bit characters to '\uxxxx' */
12583 else if (ch <= 0xffff) {
12584 PyUnicode_WRITE(okind, odata, o++, 'u');
12585 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12586 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12587 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12588 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12589 }
12590 /* Map 21-bit characters to '\U00xxxxxx' */
12591 else {
12592 PyUnicode_WRITE(okind, odata, o++, 'U');
12593 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12594 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12595 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12596 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12597 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12598 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12599 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12600 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12601 }
12602 }
12603 /* Copy characters as-is */
12604 else {
12605 PyUnicode_WRITE(okind, odata, o++, ch);
12606 }
Georg Brandl559e5d72008-06-11 18:37:52 +000012607 }
12608 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012609 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012610 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012611 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012612 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012613}
12614
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012615PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012616 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012617\n\
12618Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012619such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012620arguments start and end are interpreted as in slice notation.\n\
12621\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012622Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012623
12624static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012625unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012626{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012627 /* initialize variables to prevent gcc warning */
12628 PyObject *substring = NULL;
12629 Py_ssize_t start = 0;
12630 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012631 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012632
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012633 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012634 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012635
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012636 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012637 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012638
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012639 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012640
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012641 if (result == -2)
12642 return NULL;
12643
Christian Heimes217cfd12007-12-02 14:31:20 +000012644 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012645}
12646
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012647PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012648 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012649\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012650Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012651
12652static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012653unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012654{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012655 /* initialize variables to prevent gcc warning */
12656 PyObject *substring = NULL;
12657 Py_ssize_t start = 0;
12658 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012659 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012660
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012661 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012662 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012663
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012664 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012665 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012666
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012667 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012668
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012669 if (result == -2)
12670 return NULL;
12671
Guido van Rossumd57fd912000-03-10 22:53:23 +000012672 if (result < 0) {
12673 PyErr_SetString(PyExc_ValueError, "substring not found");
12674 return NULL;
12675 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012676
Christian Heimes217cfd12007-12-02 14:31:20 +000012677 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012678}
12679
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012680PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012681 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012682\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012683Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012684done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012685
12686static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012687unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012688{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012689 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012690 Py_UCS4 fillchar = ' ';
12691
Victor Stinnere9a29352011-10-01 02:14:59 +020012692 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012693 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012694
Benjamin Petersonbac79492012-01-14 13:34:47 -050012695 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012696 return NULL;
12697
Victor Stinnerc4b49542011-12-11 22:44:26 +010012698 if (PyUnicode_GET_LENGTH(self) >= width)
12699 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012700
Victor Stinnerc4b49542011-12-11 22:44:26 +010012701 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012702}
12703
Alexander Belopolsky40018472011-02-26 01:02:56 +000012704PyObject *
12705PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012706{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012707 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012708 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012709
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012710 return split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012711}
12712
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012713PyDoc_STRVAR(split__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012714 "S.split(sep=None, maxsplit=-1) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012715\n\
12716Return a list of the words in S, using sep as the\n\
12717delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012718splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012719whitespace string is a separator and empty strings are\n\
12720removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012721
12722static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012723unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012724{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012725 static char *kwlist[] = {"sep", "maxsplit", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000012726 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012727 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012728
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012729 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12730 kwlist, &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012731 return NULL;
12732
12733 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012734 return split(self, NULL, maxcount);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012735
12736 if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012737 return split(self, substring, maxcount);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012738
12739 PyErr_Format(PyExc_TypeError,
12740 "must be str or None, not %.100s",
12741 Py_TYPE(substring)->tp_name);
12742 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012743}
12744
Thomas Wouters477c8d52006-05-27 19:21:47 +000012745PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012746PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012747{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012748 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012749 int kind1, kind2;
12750 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012751 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012752
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012753 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012754 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012755
Victor Stinner14f8f022011-10-05 20:58:25 +020012756 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012757 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012758 len1 = PyUnicode_GET_LENGTH(str_obj);
12759 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012760 if (kind1 < kind2 || len1 < len2) {
12761 _Py_INCREF_UNICODE_EMPTY();
12762 if (!unicode_empty)
12763 out = NULL;
12764 else {
12765 out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
12766 Py_DECREF(unicode_empty);
12767 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012768 return out;
12769 }
12770 buf1 = PyUnicode_DATA(str_obj);
12771 buf2 = PyUnicode_DATA(sep_obj);
12772 if (kind2 != kind1) {
12773 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12774 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012775 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012776 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012777
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012778 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012779 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012780 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12781 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12782 else
12783 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012784 break;
12785 case PyUnicode_2BYTE_KIND:
12786 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12787 break;
12788 case PyUnicode_4BYTE_KIND:
12789 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12790 break;
12791 default:
12792 assert(0);
12793 out = 0;
12794 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012795
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012796 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012797 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012798
12799 return out;
12800}
12801
12802
12803PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012804PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012805{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012806 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012807 int kind1, kind2;
12808 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012809 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012810
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012811 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012812 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012813
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012814 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012815 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012816 len1 = PyUnicode_GET_LENGTH(str_obj);
12817 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012818 if (kind1 < kind2 || len1 < len2) {
12819 _Py_INCREF_UNICODE_EMPTY();
12820 if (!unicode_empty)
12821 out = NULL;
12822 else {
12823 out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
12824 Py_DECREF(unicode_empty);
12825 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012826 return out;
12827 }
12828 buf1 = PyUnicode_DATA(str_obj);
12829 buf2 = PyUnicode_DATA(sep_obj);
12830 if (kind2 != kind1) {
12831 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12832 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012833 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012834 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012835
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012836 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012837 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012838 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12839 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12840 else
12841 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012842 break;
12843 case PyUnicode_2BYTE_KIND:
12844 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12845 break;
12846 case PyUnicode_4BYTE_KIND:
12847 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12848 break;
12849 default:
12850 assert(0);
12851 out = 0;
12852 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012853
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012854 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012855 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012856
12857 return out;
12858}
12859
12860PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012861 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012862\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012863Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012864the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012865found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012866
12867static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012868unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012869{
Victor Stinner9310abb2011-10-05 00:59:23 +020012870 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012871}
12872
12873PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012874 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012875\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012876Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012877the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012878separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012879
12880static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012881unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012882{
Victor Stinner9310abb2011-10-05 00:59:23 +020012883 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012884}
12885
Alexander Belopolsky40018472011-02-26 01:02:56 +000012886PyObject *
12887PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012888{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012889 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012890 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012891
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012892 return rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012893}
12894
12895PyDoc_STRVAR(rsplit__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012896 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012897\n\
12898Return a list of the words in S, using sep as the\n\
12899delimiter string, starting at the end of the string and\n\
12900working to the front. If maxsplit is given, at most maxsplit\n\
12901splits are done. If sep is not specified, any whitespace string\n\
12902is a separator.");
12903
12904static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012905unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012906{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012907 static char *kwlist[] = {"sep", "maxsplit", 0};
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012908 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012909 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012910
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012911 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
12912 kwlist, &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012913 return NULL;
12914
12915 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012916 return rsplit(self, NULL, maxcount);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012917
12918 if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012919 return rsplit(self, substring, maxcount);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012920
12921 PyErr_Format(PyExc_TypeError,
12922 "must be str or None, not %.100s",
12923 Py_TYPE(substring)->tp_name);
12924 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012925}
12926
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012927PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012928 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012929\n\
12930Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012931Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012932is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012933
12934static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012935unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012936{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012937 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012938 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012939
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012940 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12941 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012942 return NULL;
12943
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012944 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012945}
12946
12947static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012948PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012949{
Victor Stinnerc4b49542011-12-11 22:44:26 +010012950 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012951}
12952
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012953PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012954 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012955\n\
12956Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012957and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012958
12959static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012960unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012961{
Benjamin Petersoneea48462012-01-16 14:28:50 -050012962 if (PyUnicode_READY(self) == -1)
12963 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012964 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012965}
12966
Larry Hastings61272b72014-01-07 12:41:53 -080012967/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000012968
Larry Hastings31826802013-10-19 00:09:25 -070012969@staticmethod
12970str.maketrans as unicode_maketrans
12971
12972 x: object
12973
12974 y: unicode=NULL
12975
12976 z: unicode=NULL
12977
12978 /
12979
12980Return a translation table usable for str.translate().
12981
12982If there is only one argument, it must be a dictionary mapping Unicode
12983ordinals (integers) or characters to Unicode ordinals, strings or None.
12984Character keys will be then converted to ordinals.
12985If there are two arguments, they must be strings of equal length, and
12986in the resulting dictionary, each character in x will be mapped to the
12987character at the same position in y. If there is a third argument, it
12988must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080012989[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070012990
Larry Hastings31826802013-10-19 00:09:25 -070012991static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080012992unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030012993/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070012994{
Georg Brandlceee0772007-11-27 23:48:05 +000012995 PyObject *new = NULL, *key, *value;
12996 Py_ssize_t i = 0;
12997 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012998
Georg Brandlceee0772007-11-27 23:48:05 +000012999 new = PyDict_New();
13000 if (!new)
13001 return NULL;
13002 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013003 int x_kind, y_kind, z_kind;
13004 void *x_data, *y_data, *z_data;
13005
Georg Brandlceee0772007-11-27 23:48:05 +000013006 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000013007 if (!PyUnicode_Check(x)) {
13008 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13009 "be a string if there is a second argument");
13010 goto err;
13011 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013012 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013013 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13014 "arguments must have equal length");
13015 goto err;
13016 }
13017 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013018 x_kind = PyUnicode_KIND(x);
13019 y_kind = PyUnicode_KIND(y);
13020 x_data = PyUnicode_DATA(x);
13021 y_data = PyUnicode_DATA(y);
13022 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13023 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013024 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000013025 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060013026 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013027 if (!value) {
13028 Py_DECREF(key);
13029 goto err;
13030 }
Georg Brandlceee0772007-11-27 23:48:05 +000013031 res = PyDict_SetItem(new, key, value);
13032 Py_DECREF(key);
13033 Py_DECREF(value);
13034 if (res < 0)
13035 goto err;
13036 }
13037 /* create entries for deleting chars in z */
13038 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013039 z_kind = PyUnicode_KIND(z);
13040 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013041 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013042 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013043 if (!key)
13044 goto err;
13045 res = PyDict_SetItem(new, key, Py_None);
13046 Py_DECREF(key);
13047 if (res < 0)
13048 goto err;
13049 }
13050 }
13051 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013052 int kind;
13053 void *data;
13054
Georg Brandlceee0772007-11-27 23:48:05 +000013055 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013056 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013057 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13058 "to maketrans it must be a dict");
13059 goto err;
13060 }
13061 /* copy entries into the new dict, converting string keys to int keys */
13062 while (PyDict_Next(x, &i, &key, &value)) {
13063 if (PyUnicode_Check(key)) {
13064 /* convert string keys to integer keys */
13065 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013066 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013067 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13068 "table must be of length 1");
13069 goto err;
13070 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013071 kind = PyUnicode_KIND(key);
13072 data = PyUnicode_DATA(key);
13073 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013074 if (!newkey)
13075 goto err;
13076 res = PyDict_SetItem(new, newkey, value);
13077 Py_DECREF(newkey);
13078 if (res < 0)
13079 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013080 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013081 /* just keep integer keys */
13082 if (PyDict_SetItem(new, key, value) < 0)
13083 goto err;
13084 } else {
13085 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13086 "be strings or integers");
13087 goto err;
13088 }
13089 }
13090 }
13091 return new;
13092 err:
13093 Py_DECREF(new);
13094 return NULL;
13095}
13096
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013097PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013098 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013099\n\
Zachary Ware79b98df2015-08-05 23:54:15 -050013100Return a copy of the string S in which each character has been mapped\n\
13101through the given translation table. The table must implement\n\
13102lookup/indexing via __getitem__, for instance a dictionary or list,\n\
13103mapping Unicode ordinals to Unicode ordinals, strings, or None. If\n\
13104this operation raises LookupError, the character is left untouched.\n\
13105Characters mapped to None are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013106
13107static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013108unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013109{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013110 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013111}
13112
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013113PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013114 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013115\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013116Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013117
13118static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020013119unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013120{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013121 if (PyUnicode_READY(self) == -1)
13122 return NULL;
13123 if (PyUnicode_IS_ASCII(self))
13124 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013125 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013126}
13127
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013128PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013129 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013130\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000013131Pad a numeric string S with zeros on the left, to fill a field\n\
13132of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013133
13134static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020013135unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013136{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013137 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013138 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013139 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013140 int kind;
13141 void *data;
13142 Py_UCS4 chr;
13143
Martin v. Löwis18e16552006-02-15 17:27:45 +000013144 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000013145 return NULL;
13146
Benjamin Petersonbac79492012-01-14 13:34:47 -050013147 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013148 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013149
Victor Stinnerc4b49542011-12-11 22:44:26 +010013150 if (PyUnicode_GET_LENGTH(self) >= width)
13151 return unicode_result_unchanged(self);
13152
13153 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013154
13155 u = pad(self, fill, 0, '0');
13156
Walter Dörwald068325e2002-04-15 13:36:47 +000013157 if (u == NULL)
13158 return NULL;
13159
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013160 kind = PyUnicode_KIND(u);
13161 data = PyUnicode_DATA(u);
13162 chr = PyUnicode_READ(kind, data, fill);
13163
13164 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013165 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013166 PyUnicode_WRITE(kind, data, 0, chr);
13167 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013168 }
13169
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013170 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013171 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013172}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013173
13174#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013175static PyObject *
13176unicode__decimal2ascii(PyObject *self)
13177{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013178 return PyUnicode_TransformDecimalAndSpaceToASCII(self);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013179}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013180#endif
13181
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013182PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013183 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013184\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013185Return True if S starts with the specified prefix, False otherwise.\n\
13186With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013187With optional end, stop comparing S at that position.\n\
13188prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013189
13190static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013191unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013192 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013193{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013194 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013195 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013196 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013197 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013198 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013199
Jesus Ceaac451502011-04-20 17:09:23 +020013200 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013201 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013202 if (PyTuple_Check(subobj)) {
13203 Py_ssize_t i;
13204 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013205 substring = PyTuple_GET_ITEM(subobj, i);
13206 if (!PyUnicode_Check(substring)) {
13207 PyErr_Format(PyExc_TypeError,
13208 "tuple for startswith must only contain str, "
13209 "not %.100s",
13210 Py_TYPE(substring)->tp_name);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013211 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013212 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013213 result = tailmatch(self, substring, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013214 if (result == -1)
13215 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013216 if (result) {
13217 Py_RETURN_TRUE;
13218 }
13219 }
13220 /* nothing matched */
13221 Py_RETURN_FALSE;
13222 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013223 if (!PyUnicode_Check(subobj)) {
13224 PyErr_Format(PyExc_TypeError,
13225 "startswith first arg must be str or "
13226 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013227 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013228 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013229 result = tailmatch(self, subobj, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013230 if (result == -1)
13231 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013232 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013233}
13234
13235
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013236PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013237 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013238\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013239Return True if S ends with the specified suffix, False otherwise.\n\
13240With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013241With optional end, stop comparing S at that position.\n\
13242suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013243
13244static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013245unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013246 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013247{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013248 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013249 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013250 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013251 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013252 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013253
Jesus Ceaac451502011-04-20 17:09:23 +020013254 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013255 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013256 if (PyTuple_Check(subobj)) {
13257 Py_ssize_t i;
13258 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013259 substring = PyTuple_GET_ITEM(subobj, i);
13260 if (!PyUnicode_Check(substring)) {
13261 PyErr_Format(PyExc_TypeError,
13262 "tuple for endswith must only contain str, "
13263 "not %.100s",
13264 Py_TYPE(substring)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013265 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013266 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013267 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013268 if (result == -1)
13269 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013270 if (result) {
13271 Py_RETURN_TRUE;
13272 }
13273 }
13274 Py_RETURN_FALSE;
13275 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013276 if (!PyUnicode_Check(subobj)) {
13277 PyErr_Format(PyExc_TypeError,
13278 "endswith first arg must be str or "
13279 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013280 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013281 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013282 result = tailmatch(self, subobj, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013283 if (result == -1)
13284 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013285 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013286}
13287
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013288static inline void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013289_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013290{
Victor Stinnereb36fda2015-10-03 01:55:51 +020013291 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13292 writer->data = PyUnicode_DATA(writer->buffer);
13293
13294 if (!writer->readonly) {
13295 writer->kind = PyUnicode_KIND(writer->buffer);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013296 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinnereb36fda2015-10-03 01:55:51 +020013297 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013298 else {
Victor Stinnereb36fda2015-10-03 01:55:51 +020013299 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13300 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13301 writer->kind = PyUnicode_WCHAR_KIND;
13302 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13303
Victor Stinner8f674cc2013-04-17 23:02:17 +020013304 /* Copy-on-write mode: set buffer size to 0 so
13305 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13306 * next write. */
13307 writer->size = 0;
13308 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013309}
13310
Victor Stinnerd3f08822012-05-29 12:57:52 +020013311void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013312_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013313{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013314 memset(writer, 0, sizeof(*writer));
Victor Stinnereb36fda2015-10-03 01:55:51 +020013315
13316 /* ASCII is the bare minimum */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013317 writer->min_char = 127;
Victor Stinnereb36fda2015-10-03 01:55:51 +020013318
13319 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13320 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13321 writer->kind = PyUnicode_WCHAR_KIND;
13322 assert(writer->kind <= PyUnicode_1BYTE_KIND);
Victor Stinner202fdca2012-05-07 12:47:02 +020013323}
13324
Victor Stinnerd3f08822012-05-29 12:57:52 +020013325int
13326_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13327 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013328{
13329 Py_ssize_t newlen;
13330 PyObject *newbuffer;
13331
Victor Stinner2740e462016-09-06 16:58:36 -070013332 assert(maxchar <= MAX_UNICODE);
13333
Victor Stinnerca9381e2015-09-22 00:58:32 +020013334 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
Victor Stinner61744742015-09-22 01:01:17 +020013335 assert((maxchar > writer->maxchar && length >= 0)
13336 || length > 0);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013337
Victor Stinner202fdca2012-05-07 12:47:02 +020013338 if (length > PY_SSIZE_T_MAX - writer->pos) {
13339 PyErr_NoMemory();
13340 return -1;
13341 }
13342 newlen = writer->pos + length;
13343
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013344 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013345
Victor Stinnerd3f08822012-05-29 12:57:52 +020013346 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013347 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013348 if (writer->overallocate
13349 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13350 /* overallocate to limit the number of realloc() */
13351 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013352 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013353 if (newlen < writer->min_length)
13354 newlen = writer->min_length;
13355
Victor Stinnerd3f08822012-05-29 12:57:52 +020013356 writer->buffer = PyUnicode_New(newlen, maxchar);
13357 if (writer->buffer == NULL)
13358 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013359 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013360 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013361 if (writer->overallocate
13362 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13363 /* overallocate to limit the number of realloc() */
13364 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013365 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013366 if (newlen < writer->min_length)
13367 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013368
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013369 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013370 /* resize + widen */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +030013371 maxchar = Py_MAX(maxchar, writer->maxchar);
Victor Stinner202fdca2012-05-07 12:47:02 +020013372 newbuffer = PyUnicode_New(newlen, maxchar);
13373 if (newbuffer == NULL)
13374 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013375 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13376 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013377 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013378 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013379 }
13380 else {
13381 newbuffer = resize_compact(writer->buffer, newlen);
13382 if (newbuffer == NULL)
13383 return -1;
13384 }
13385 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013386 }
13387 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013388 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013389 newbuffer = PyUnicode_New(writer->size, maxchar);
13390 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013391 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013392 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13393 writer->buffer, 0, writer->pos);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030013394 Py_SETREF(writer->buffer, newbuffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013395 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013396 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013397 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010013398
13399#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020013400}
13401
Victor Stinnerca9381e2015-09-22 00:58:32 +020013402int
13403_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13404 enum PyUnicode_Kind kind)
13405{
13406 Py_UCS4 maxchar;
13407
13408 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13409 assert(writer->kind < kind);
13410
13411 switch (kind)
13412 {
13413 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13414 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13415 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
13416 default:
13417 assert(0 && "invalid kind");
13418 return -1;
13419 }
13420
13421 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13422}
13423
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013424static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013425_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013426{
Victor Stinner2740e462016-09-06 16:58:36 -070013427 assert(ch <= MAX_UNICODE);
Victor Stinnera0dd0212013-04-11 22:09:04 +020013428 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13429 return -1;
13430 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13431 writer->pos++;
13432 return 0;
13433}
13434
13435int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013436_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13437{
13438 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13439}
13440
13441int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013442_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13443{
13444 Py_UCS4 maxchar;
13445 Py_ssize_t len;
13446
13447 if (PyUnicode_READY(str) == -1)
13448 return -1;
13449 len = PyUnicode_GET_LENGTH(str);
13450 if (len == 0)
13451 return 0;
13452 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13453 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013454 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010013455 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020013456 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013457 Py_INCREF(str);
13458 writer->buffer = str;
13459 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013460 writer->pos += len;
13461 return 0;
13462 }
13463 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13464 return -1;
13465 }
13466 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13467 str, 0, len);
13468 writer->pos += len;
13469 return 0;
13470}
13471
Victor Stinnere215d962012-10-06 23:03:36 +020013472int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013473_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13474 Py_ssize_t start, Py_ssize_t end)
13475{
13476 Py_UCS4 maxchar;
13477 Py_ssize_t len;
13478
13479 if (PyUnicode_READY(str) == -1)
13480 return -1;
13481
13482 assert(0 <= start);
13483 assert(end <= PyUnicode_GET_LENGTH(str));
13484 assert(start <= end);
13485
13486 if (end == 0)
13487 return 0;
13488
13489 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13490 return _PyUnicodeWriter_WriteStr(writer, str);
13491
13492 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13493 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13494 else
13495 maxchar = writer->maxchar;
13496 len = end - start;
13497
13498 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13499 return -1;
13500
13501 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13502 str, start, len);
13503 writer->pos += len;
13504 return 0;
13505}
13506
13507int
Victor Stinner4a587072013-11-19 12:54:53 +010013508_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13509 const char *ascii, Py_ssize_t len)
13510{
13511 if (len == -1)
13512 len = strlen(ascii);
13513
13514 assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13515
13516 if (writer->buffer == NULL && !writer->overallocate) {
13517 PyObject *str;
13518
13519 str = _PyUnicode_FromASCII(ascii, len);
13520 if (str == NULL)
13521 return -1;
13522
13523 writer->readonly = 1;
13524 writer->buffer = str;
13525 _PyUnicodeWriter_Update(writer);
13526 writer->pos += len;
13527 return 0;
13528 }
13529
13530 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13531 return -1;
13532
13533 switch (writer->kind)
13534 {
13535 case PyUnicode_1BYTE_KIND:
13536 {
13537 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13538 Py_UCS1 *data = writer->data;
13539
Christian Heimesf051e432016-09-13 20:22:02 +020013540 memcpy(data + writer->pos, str, len);
Victor Stinner4a587072013-11-19 12:54:53 +010013541 break;
13542 }
13543 case PyUnicode_2BYTE_KIND:
13544 {
13545 _PyUnicode_CONVERT_BYTES(
13546 Py_UCS1, Py_UCS2,
13547 ascii, ascii + len,
13548 (Py_UCS2 *)writer->data + writer->pos);
13549 break;
13550 }
13551 case PyUnicode_4BYTE_KIND:
13552 {
13553 _PyUnicode_CONVERT_BYTES(
13554 Py_UCS1, Py_UCS4,
13555 ascii, ascii + len,
13556 (Py_UCS4 *)writer->data + writer->pos);
13557 break;
13558 }
13559 default:
13560 assert(0);
13561 }
13562
13563 writer->pos += len;
13564 return 0;
13565}
13566
13567int
13568_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13569 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020013570{
13571 Py_UCS4 maxchar;
13572
13573 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13574 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13575 return -1;
13576 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13577 writer->pos += len;
13578 return 0;
13579}
13580
Victor Stinnerd3f08822012-05-29 12:57:52 +020013581PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013582_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013583{
Victor Stinner15a0bd32013-07-08 22:29:55 +020013584 PyObject *str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013585 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013586 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013587 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013588 }
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013589 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013590 str = writer->buffer;
13591 writer->buffer = NULL;
13592 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13593 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013594 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013595 if (writer->pos == 0) {
13596 Py_CLEAR(writer->buffer);
13597
13598 /* Get the empty Unicode string singleton ('') */
13599 _Py_INCREF_UNICODE_EMPTY();
13600 str = unicode_empty;
Victor Stinner202fdca2012-05-07 12:47:02 +020013601 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013602 else {
13603 str = writer->buffer;
13604 writer->buffer = NULL;
13605
13606 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
13607 PyObject *str2;
13608 str2 = resize_compact(str, writer->pos);
13609 if (str2 == NULL)
13610 return NULL;
13611 str = str2;
13612 }
13613 }
13614
Victor Stinner15a0bd32013-07-08 22:29:55 +020013615 assert(_PyUnicode_CheckConsistency(str, 1));
13616 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020013617}
13618
Victor Stinnerd3f08822012-05-29 12:57:52 +020013619void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013620_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013621{
13622 Py_CLEAR(writer->buffer);
13623}
13624
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013625#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013626
13627PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013628 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013629\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013630Return a formatted version of S, using substitutions from args and kwargs.\n\
13631The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013632
Eric Smith27bbca62010-11-04 17:06:58 +000013633PyDoc_STRVAR(format_map__doc__,
13634 "S.format_map(mapping) -> str\n\
13635\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013636Return a formatted version of S, using substitutions from mapping.\n\
13637The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013638
Eric Smith4a7d76d2008-05-30 18:10:19 +000013639static PyObject *
13640unicode__format__(PyObject* self, PyObject* args)
13641{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013642 PyObject *format_spec;
13643 _PyUnicodeWriter writer;
13644 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013645
13646 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
13647 return NULL;
13648
Victor Stinnerd3f08822012-05-29 12:57:52 +020013649 if (PyUnicode_READY(self) == -1)
13650 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020013651 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013652 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13653 self, format_spec, 0,
13654 PyUnicode_GET_LENGTH(format_spec));
13655 if (ret == -1) {
13656 _PyUnicodeWriter_Dealloc(&writer);
13657 return NULL;
13658 }
13659 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013660}
13661
Eric Smith8c663262007-08-25 02:26:07 +000013662PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013663 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013664\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013665Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000013666
13667static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013668unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013669{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013670 Py_ssize_t size;
13671
13672 /* If it's a compact object, account for base structure +
13673 character data. */
13674 if (PyUnicode_IS_COMPACT_ASCII(v))
13675 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
13676 else if (PyUnicode_IS_COMPACT(v))
13677 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013678 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013679 else {
13680 /* If it is a two-block object, account for base object, and
13681 for character block if present. */
13682 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020013683 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013684 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013685 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013686 }
13687 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013688 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020013689 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013690 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020013691 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020013692 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013693
13694 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013695}
13696
13697PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013698 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013699
13700static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013701unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013702{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013703 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013704 if (!copy)
13705 return NULL;
13706 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013707}
13708
Guido van Rossumd57fd912000-03-10 22:53:23 +000013709static PyMethodDef unicode_methods[] = {
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000013710 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013711 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
Ezio Melotticda6b6d2012-02-26 09:39:55 +020013712 {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
13713 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013714 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
13715 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
Benjamin Petersond5890c82012-01-14 13:23:30 -050013716 {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013717 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
13718 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
13719 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
Ezio Melotti745d54d2013-11-16 19:10:57 +020013720 {"expandtabs", (PyCFunction) unicode_expandtabs,
13721 METH_VARARGS | METH_KEYWORDS, expandtabs__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013722 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013723 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013724 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
13725 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
13726 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013727 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013728 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13729 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
13730 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013731 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013732 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Ezio Melotti745d54d2013-11-16 19:10:57 +020013733 {"splitlines", (PyCFunction) unicode_splitlines,
13734 METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013735 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013736 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
13737 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
13738 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
13739 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13740 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
13741 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
13742 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
13743 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
13744 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
13745 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
13746 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
13747 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
13748 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
13749 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000013750 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000013751 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013752 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000013753 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013754 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000013755 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Larry Hastings31826802013-10-19 00:09:25 -070013756 UNICODE_MAKETRANS_METHODDEF
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013757 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000013758#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013759 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013760 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013761#endif
13762
Benjamin Peterson14339b62009-01-31 16:36:08 +000013763 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013764 {NULL, NULL}
13765};
13766
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013767static PyObject *
13768unicode_mod(PyObject *v, PyObject *w)
13769{
Brian Curtindfc80e32011-08-10 20:28:54 -050013770 if (!PyUnicode_Check(v))
13771 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013772 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013773}
13774
13775static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013776 0, /*nb_add*/
13777 0, /*nb_subtract*/
13778 0, /*nb_multiply*/
13779 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013780};
13781
Guido van Rossumd57fd912000-03-10 22:53:23 +000013782static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013783 (lenfunc) unicode_length, /* sq_length */
13784 PyUnicode_Concat, /* sq_concat */
13785 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13786 (ssizeargfunc) unicode_getitem, /* sq_item */
13787 0, /* sq_slice */
13788 0, /* sq_ass_item */
13789 0, /* sq_ass_slice */
13790 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013791};
13792
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013793static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013794unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013795{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013796 if (PyUnicode_READY(self) == -1)
13797 return NULL;
13798
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013799 if (PyIndex_Check(item)) {
13800 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013801 if (i == -1 && PyErr_Occurred())
13802 return NULL;
13803 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013804 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013805 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013806 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013807 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013808 PyObject *result;
13809 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013810 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013811 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013812
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013813 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013814 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013815 return NULL;
13816 }
13817
13818 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020013819 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013820 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013821 slicelength == PyUnicode_GET_LENGTH(self)) {
13822 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013823 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013824 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013825 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013826 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013827 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013828 src_kind = PyUnicode_KIND(self);
13829 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013830 if (!PyUnicode_IS_ASCII(self)) {
13831 kind_limit = kind_maxchar_limit(src_kind);
13832 max_char = 0;
13833 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13834 ch = PyUnicode_READ(src_kind, src_data, cur);
13835 if (ch > max_char) {
13836 max_char = ch;
13837 if (max_char >= kind_limit)
13838 break;
13839 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013840 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013841 }
Victor Stinner55c99112011-10-13 01:17:06 +020013842 else
13843 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013844 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013845 if (result == NULL)
13846 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013847 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013848 dest_data = PyUnicode_DATA(result);
13849
13850 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013851 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13852 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013853 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013854 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013855 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013856 } else {
13857 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13858 return NULL;
13859 }
13860}
13861
13862static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013863 (lenfunc)unicode_length, /* mp_length */
13864 (binaryfunc)unicode_subscript, /* mp_subscript */
13865 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013866};
13867
Guido van Rossumd57fd912000-03-10 22:53:23 +000013868
Guido van Rossumd57fd912000-03-10 22:53:23 +000013869/* Helpers for PyUnicode_Format() */
13870
Victor Stinnera47082312012-10-04 02:19:54 +020013871struct unicode_formatter_t {
13872 PyObject *args;
13873 int args_owned;
13874 Py_ssize_t arglen, argidx;
13875 PyObject *dict;
13876
13877 enum PyUnicode_Kind fmtkind;
13878 Py_ssize_t fmtcnt, fmtpos;
13879 void *fmtdata;
13880 PyObject *fmtstr;
13881
13882 _PyUnicodeWriter writer;
13883};
13884
13885struct unicode_format_arg_t {
13886 Py_UCS4 ch;
13887 int flags;
13888 Py_ssize_t width;
13889 int prec;
13890 int sign;
13891};
13892
Guido van Rossumd57fd912000-03-10 22:53:23 +000013893static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020013894unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013895{
Victor Stinnera47082312012-10-04 02:19:54 +020013896 Py_ssize_t argidx = ctx->argidx;
13897
13898 if (argidx < ctx->arglen) {
13899 ctx->argidx++;
13900 if (ctx->arglen < 0)
13901 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000013902 else
Victor Stinnera47082312012-10-04 02:19:54 +020013903 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013904 }
13905 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013906 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013907 return NULL;
13908}
13909
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013910/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013911
Victor Stinnera47082312012-10-04 02:19:54 +020013912/* Format a float into the writer if the writer is not NULL, or into *p_output
13913 otherwise.
13914
13915 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020013916static int
Victor Stinnera47082312012-10-04 02:19:54 +020013917formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
13918 PyObject **p_output,
13919 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013920{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013921 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013922 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013923 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020013924 int prec;
13925 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000013926
Guido van Rossumd57fd912000-03-10 22:53:23 +000013927 x = PyFloat_AsDouble(v);
13928 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020013929 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013930
Victor Stinnera47082312012-10-04 02:19:54 +020013931 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013932 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013933 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013934
Victor Stinnera47082312012-10-04 02:19:54 +020013935 if (arg->flags & F_ALT)
13936 dtoa_flags = Py_DTSF_ALT;
13937 else
13938 dtoa_flags = 0;
13939 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013940 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020013941 return -1;
13942 len = strlen(p);
13943 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010013944 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020013945 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013946 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020013947 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020013948 }
13949 else
13950 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000013951 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013952 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013953}
13954
Victor Stinnerd0880d52012-04-27 23:40:13 +020013955/* formatlong() emulates the format codes d, u, o, x and X, and
13956 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
13957 * Python's regular ints.
13958 * Return value: a new PyUnicodeObject*, or NULL if error.
13959 * The output string is of the form
13960 * "-"? ("0x" | "0X")? digit+
13961 * "0x"/"0X" are present only for x and X conversions, with F_ALT
13962 * set in flags. The case of hex digits will be correct,
13963 * There will be at least prec digits, zero-filled on the left if
13964 * necessary to get that many.
13965 * val object to be converted
13966 * flags bitmask of format flags; only F_ALT is looked at
13967 * prec minimum number of digits; 0-fill on left if needed
13968 * type a character in [duoxX]; u acts the same as d
13969 *
13970 * CAUTION: o, x and X conversions on regular ints can never
13971 * produce a '-' sign, but can for Python's unbounded ints.
13972 */
Ethan Furmanb95b5612015-01-23 20:05:18 -080013973PyObject *
13974_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
Tim Peters38fd5b62000-09-21 05:43:11 +000013975{
Victor Stinnerd0880d52012-04-27 23:40:13 +020013976 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013977 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013978 Py_ssize_t i;
13979 int sign; /* 1 if '-', else 0 */
13980 int len; /* number of characters */
13981 Py_ssize_t llen;
13982 int numdigits; /* len == numnondigits + numdigits */
13983 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000013984
Victor Stinnerd0880d52012-04-27 23:40:13 +020013985 /* Avoid exceeding SSIZE_T_MAX */
13986 if (prec > INT_MAX-3) {
13987 PyErr_SetString(PyExc_OverflowError,
13988 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013989 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013990 }
13991
13992 assert(PyLong_Check(val));
13993
13994 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020013995 default:
13996 assert(!"'type' not in [diuoxX]");
Victor Stinnerd0880d52012-04-27 23:40:13 +020013997 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020013998 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020013999 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070014000 /* int and int subclasses should print numerically when a numeric */
14001 /* format code is used (see issue18780) */
14002 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014003 break;
14004 case 'o':
14005 numnondigits = 2;
14006 result = PyNumber_ToBase(val, 8);
14007 break;
14008 case 'x':
14009 case 'X':
14010 numnondigits = 2;
14011 result = PyNumber_ToBase(val, 16);
14012 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014013 }
14014 if (!result)
14015 return NULL;
14016
14017 assert(unicode_modifiable(result));
14018 assert(PyUnicode_IS_READY(result));
14019 assert(PyUnicode_IS_ASCII(result));
14020
14021 /* To modify the string in-place, there can only be one reference. */
14022 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014023 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014024 PyErr_BadInternalCall();
14025 return NULL;
14026 }
14027 buf = PyUnicode_DATA(result);
14028 llen = PyUnicode_GET_LENGTH(result);
14029 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014030 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014031 PyErr_SetString(PyExc_ValueError,
Ethan Furmanb95b5612015-01-23 20:05:18 -080014032 "string too large in _PyUnicode_FormatLong");
Victor Stinnerd0880d52012-04-27 23:40:13 +020014033 return NULL;
14034 }
14035 len = (int)llen;
14036 sign = buf[0] == '-';
14037 numnondigits += sign;
14038 numdigits = len - numnondigits;
14039 assert(numdigits > 0);
14040
14041 /* Get rid of base marker unless F_ALT */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014042 if (((alt) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020014043 (type == 'o' || type == 'x' || type == 'X'))) {
14044 assert(buf[sign] == '0');
14045 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14046 buf[sign+1] == 'o');
14047 numnondigits -= 2;
14048 buf += 2;
14049 len -= 2;
14050 if (sign)
14051 buf[0] = '-';
14052 assert(len == numnondigits + numdigits);
14053 assert(numdigits > 0);
14054 }
14055
14056 /* Fill with leading zeroes to meet minimum width. */
14057 if (prec > numdigits) {
14058 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14059 numnondigits + prec);
14060 char *b1;
14061 if (!r1) {
14062 Py_DECREF(result);
14063 return NULL;
14064 }
14065 b1 = PyBytes_AS_STRING(r1);
14066 for (i = 0; i < numnondigits; ++i)
14067 *b1++ = *buf++;
14068 for (i = 0; i < prec - numdigits; i++)
14069 *b1++ = '0';
14070 for (i = 0; i < numdigits; i++)
14071 *b1++ = *buf++;
14072 *b1 = '\0';
14073 Py_DECREF(result);
14074 result = r1;
14075 buf = PyBytes_AS_STRING(result);
14076 len = numnondigits + prec;
14077 }
14078
14079 /* Fix up case for hex conversions. */
14080 if (type == 'X') {
14081 /* Need to convert all lower case letters to upper case.
14082 and need to convert 0x to 0X (and -0x to -0X). */
14083 for (i = 0; i < len; i++)
14084 if (buf[i] >= 'a' && buf[i] <= 'x')
14085 buf[i] -= 'a'-'A';
14086 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014087 if (!PyUnicode_Check(result)
14088 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020014089 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014090 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014091 Py_DECREF(result);
14092 result = unicode;
14093 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014094 else if (len != PyUnicode_GET_LENGTH(result)) {
14095 if (PyUnicode_Resize(&result, len) < 0)
14096 Py_CLEAR(result);
14097 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014098 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000014099}
14100
Ethan Furmandf3ed242014-01-05 06:50:30 -080014101/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014102 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014103 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014104 * -1 and raise an exception on error */
14105static int
Victor Stinnera47082312012-10-04 02:19:54 +020014106mainformatlong(PyObject *v,
14107 struct unicode_format_arg_t *arg,
14108 PyObject **p_output,
14109 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014110{
14111 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014112 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014113
14114 if (!PyNumber_Check(v))
14115 goto wrongtype;
14116
Ethan Furman9ab74802014-03-21 06:38:46 -070014117 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014118 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014119 if (type == 'o' || type == 'x' || type == 'X') {
14120 iobj = PyNumber_Index(v);
14121 if (iobj == NULL) {
Ethan Furman9ab74802014-03-21 06:38:46 -070014122 if (PyErr_ExceptionMatches(PyExc_TypeError))
14123 goto wrongtype;
Ethan Furman38d872e2014-03-19 08:38:52 -070014124 return -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014125 }
14126 }
14127 else {
14128 iobj = PyNumber_Long(v);
14129 if (iobj == NULL ) {
14130 if (PyErr_ExceptionMatches(PyExc_TypeError))
14131 goto wrongtype;
14132 return -1;
14133 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014134 }
14135 assert(PyLong_Check(iobj));
14136 }
14137 else {
14138 iobj = v;
14139 Py_INCREF(iobj);
14140 }
14141
14142 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014143 && arg->width == -1 && arg->prec == -1
14144 && !(arg->flags & (F_SIGN | F_BLANK))
14145 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014146 {
14147 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014148 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014149 int base;
14150
Victor Stinnera47082312012-10-04 02:19:54 +020014151 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014152 {
14153 default:
14154 assert(0 && "'type' not in [diuoxX]");
14155 case 'd':
14156 case 'i':
14157 case 'u':
14158 base = 10;
14159 break;
14160 case 'o':
14161 base = 8;
14162 break;
14163 case 'x':
14164 case 'X':
14165 base = 16;
14166 break;
14167 }
14168
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014169 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14170 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014171 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014172 }
14173 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014174 return 1;
14175 }
14176
Ethan Furmanb95b5612015-01-23 20:05:18 -080014177 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014178 Py_DECREF(iobj);
14179 if (res == NULL)
14180 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014181 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014182 return 0;
14183
14184wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014185 switch(type)
14186 {
14187 case 'o':
14188 case 'x':
14189 case 'X':
14190 PyErr_Format(PyExc_TypeError,
14191 "%%%c format: an integer is required, "
14192 "not %.200s",
14193 type, Py_TYPE(v)->tp_name);
14194 break;
14195 default:
14196 PyErr_Format(PyExc_TypeError,
14197 "%%%c format: a number is required, "
14198 "not %.200s",
14199 type, Py_TYPE(v)->tp_name);
14200 break;
14201 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014202 return -1;
14203}
14204
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014205static Py_UCS4
14206formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014207{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014208 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014209 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014210 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014211 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014212 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014213 goto onError;
14214 }
14215 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014216 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014217 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014218 /* make sure number is a type of integer */
14219 if (!PyLong_Check(v)) {
14220 iobj = PyNumber_Index(v);
14221 if (iobj == NULL) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014222 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014223 }
14224 v = iobj;
14225 Py_DECREF(iobj);
14226 }
14227 /* Integer input truncated to a character */
Benjamin Peterson29060642009-01-31 22:14:21 +000014228 x = PyLong_AsLong(v);
14229 if (x == -1 && PyErr_Occurred())
14230 goto onError;
14231
Victor Stinner8faf8212011-12-08 22:14:11 +010014232 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014233 PyErr_SetString(PyExc_OverflowError,
14234 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014235 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014236 }
14237
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014238 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014239 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014240
Benjamin Peterson29060642009-01-31 22:14:21 +000014241 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014242 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014243 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014244 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014245}
14246
Victor Stinnera47082312012-10-04 02:19:54 +020014247/* Parse options of an argument: flags, width, precision.
14248 Handle also "%(name)" syntax.
14249
14250 Return 0 if the argument has been formatted into arg->str.
14251 Return 1 if the argument has been written into ctx->writer,
14252 Raise an exception and return -1 on error. */
14253static int
14254unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14255 struct unicode_format_arg_t *arg)
14256{
14257#define FORMAT_READ(ctx) \
14258 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14259
14260 PyObject *v;
14261
Victor Stinnera47082312012-10-04 02:19:54 +020014262 if (arg->ch == '(') {
14263 /* Get argument value from a dictionary. Example: "%(name)s". */
14264 Py_ssize_t keystart;
14265 Py_ssize_t keylen;
14266 PyObject *key;
14267 int pcount = 1;
14268
14269 if (ctx->dict == NULL) {
14270 PyErr_SetString(PyExc_TypeError,
14271 "format requires a mapping");
14272 return -1;
14273 }
14274 ++ctx->fmtpos;
14275 --ctx->fmtcnt;
14276 keystart = ctx->fmtpos;
14277 /* Skip over balanced parentheses */
14278 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14279 arg->ch = FORMAT_READ(ctx);
14280 if (arg->ch == ')')
14281 --pcount;
14282 else if (arg->ch == '(')
14283 ++pcount;
14284 ctx->fmtpos++;
14285 }
14286 keylen = ctx->fmtpos - keystart - 1;
14287 if (ctx->fmtcnt < 0 || pcount > 0) {
14288 PyErr_SetString(PyExc_ValueError,
14289 "incomplete format key");
14290 return -1;
14291 }
14292 key = PyUnicode_Substring(ctx->fmtstr,
14293 keystart, keystart + keylen);
14294 if (key == NULL)
14295 return -1;
14296 if (ctx->args_owned) {
Victor Stinnera47082312012-10-04 02:19:54 +020014297 ctx->args_owned = 0;
Serhiy Storchaka191321d2015-12-27 15:41:34 +020014298 Py_DECREF(ctx->args);
Victor Stinnera47082312012-10-04 02:19:54 +020014299 }
14300 ctx->args = PyObject_GetItem(ctx->dict, key);
14301 Py_DECREF(key);
14302 if (ctx->args == NULL)
14303 return -1;
14304 ctx->args_owned = 1;
14305 ctx->arglen = -1;
14306 ctx->argidx = -2;
14307 }
14308
14309 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014310 while (--ctx->fmtcnt >= 0) {
14311 arg->ch = FORMAT_READ(ctx);
14312 ctx->fmtpos++;
14313 switch (arg->ch) {
14314 case '-': arg->flags |= F_LJUST; continue;
14315 case '+': arg->flags |= F_SIGN; continue;
14316 case ' ': arg->flags |= F_BLANK; continue;
14317 case '#': arg->flags |= F_ALT; continue;
14318 case '0': arg->flags |= F_ZERO; continue;
14319 }
14320 break;
14321 }
14322
14323 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014324 if (arg->ch == '*') {
14325 v = unicode_format_getnextarg(ctx);
14326 if (v == NULL)
14327 return -1;
14328 if (!PyLong_Check(v)) {
14329 PyErr_SetString(PyExc_TypeError,
14330 "* wants int");
14331 return -1;
14332 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014333 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014334 if (arg->width == -1 && PyErr_Occurred())
14335 return -1;
14336 if (arg->width < 0) {
14337 arg->flags |= F_LJUST;
14338 arg->width = -arg->width;
14339 }
14340 if (--ctx->fmtcnt >= 0) {
14341 arg->ch = FORMAT_READ(ctx);
14342 ctx->fmtpos++;
14343 }
14344 }
14345 else if (arg->ch >= '0' && arg->ch <= '9') {
14346 arg->width = arg->ch - '0';
14347 while (--ctx->fmtcnt >= 0) {
14348 arg->ch = FORMAT_READ(ctx);
14349 ctx->fmtpos++;
14350 if (arg->ch < '0' || arg->ch > '9')
14351 break;
14352 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14353 mixing signed and unsigned comparison. Since arg->ch is between
14354 '0' and '9', casting to int is safe. */
14355 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14356 PyErr_SetString(PyExc_ValueError,
14357 "width too big");
14358 return -1;
14359 }
14360 arg->width = arg->width*10 + (arg->ch - '0');
14361 }
14362 }
14363
14364 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014365 if (arg->ch == '.') {
14366 arg->prec = 0;
14367 if (--ctx->fmtcnt >= 0) {
14368 arg->ch = FORMAT_READ(ctx);
14369 ctx->fmtpos++;
14370 }
14371 if (arg->ch == '*') {
14372 v = unicode_format_getnextarg(ctx);
14373 if (v == NULL)
14374 return -1;
14375 if (!PyLong_Check(v)) {
14376 PyErr_SetString(PyExc_TypeError,
14377 "* wants int");
14378 return -1;
14379 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014380 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014381 if (arg->prec == -1 && PyErr_Occurred())
14382 return -1;
14383 if (arg->prec < 0)
14384 arg->prec = 0;
14385 if (--ctx->fmtcnt >= 0) {
14386 arg->ch = FORMAT_READ(ctx);
14387 ctx->fmtpos++;
14388 }
14389 }
14390 else if (arg->ch >= '0' && arg->ch <= '9') {
14391 arg->prec = arg->ch - '0';
14392 while (--ctx->fmtcnt >= 0) {
14393 arg->ch = FORMAT_READ(ctx);
14394 ctx->fmtpos++;
14395 if (arg->ch < '0' || arg->ch > '9')
14396 break;
14397 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14398 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014399 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014400 return -1;
14401 }
14402 arg->prec = arg->prec*10 + (arg->ch - '0');
14403 }
14404 }
14405 }
14406
14407 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14408 if (ctx->fmtcnt >= 0) {
14409 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14410 if (--ctx->fmtcnt >= 0) {
14411 arg->ch = FORMAT_READ(ctx);
14412 ctx->fmtpos++;
14413 }
14414 }
14415 }
14416 if (ctx->fmtcnt < 0) {
14417 PyErr_SetString(PyExc_ValueError,
14418 "incomplete format");
14419 return -1;
14420 }
14421 return 0;
14422
14423#undef FORMAT_READ
14424}
14425
14426/* Format one argument. Supported conversion specifiers:
14427
14428 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014429 - "i", "d", "u": int or float
14430 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014431 - "e", "E", "f", "F", "g", "G": float
14432 - "c": int or str (1 character)
14433
Victor Stinner8dbd4212012-12-04 09:30:24 +010014434 When possible, the output is written directly into the Unicode writer
14435 (ctx->writer). A string is created when padding is required.
14436
Victor Stinnera47082312012-10-04 02:19:54 +020014437 Return 0 if the argument has been formatted into *p_str,
14438 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014439 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014440static int
14441unicode_format_arg_format(struct unicode_formatter_t *ctx,
14442 struct unicode_format_arg_t *arg,
14443 PyObject **p_str)
14444{
14445 PyObject *v;
14446 _PyUnicodeWriter *writer = &ctx->writer;
14447
14448 if (ctx->fmtcnt == 0)
14449 ctx->writer.overallocate = 0;
14450
14451 if (arg->ch == '%') {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014452 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014453 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014454 return 1;
14455 }
14456
14457 v = unicode_format_getnextarg(ctx);
14458 if (v == NULL)
14459 return -1;
14460
Victor Stinnera47082312012-10-04 02:19:54 +020014461
14462 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014463 case 's':
14464 case 'r':
14465 case 'a':
14466 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14467 /* Fast path */
14468 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14469 return -1;
14470 return 1;
14471 }
14472
14473 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14474 *p_str = v;
14475 Py_INCREF(*p_str);
14476 }
14477 else {
14478 if (arg->ch == 's')
14479 *p_str = PyObject_Str(v);
14480 else if (arg->ch == 'r')
14481 *p_str = PyObject_Repr(v);
14482 else
14483 *p_str = PyObject_ASCII(v);
14484 }
14485 break;
14486
14487 case 'i':
14488 case 'd':
14489 case 'u':
14490 case 'o':
14491 case 'x':
14492 case 'X':
14493 {
14494 int ret = mainformatlong(v, arg, p_str, writer);
14495 if (ret != 0)
14496 return ret;
14497 arg->sign = 1;
14498 break;
14499 }
14500
14501 case 'e':
14502 case 'E':
14503 case 'f':
14504 case 'F':
14505 case 'g':
14506 case 'G':
14507 if (arg->width == -1 && arg->prec == -1
14508 && !(arg->flags & (F_SIGN | F_BLANK)))
14509 {
14510 /* Fast path */
14511 if (formatfloat(v, arg, NULL, writer) == -1)
14512 return -1;
14513 return 1;
14514 }
14515
14516 arg->sign = 1;
14517 if (formatfloat(v, arg, p_str, NULL) == -1)
14518 return -1;
14519 break;
14520
14521 case 'c':
14522 {
14523 Py_UCS4 ch = formatchar(v);
14524 if (ch == (Py_UCS4) -1)
14525 return -1;
14526 if (arg->width == -1 && arg->prec == -1) {
14527 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014528 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014529 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014530 return 1;
14531 }
14532 *p_str = PyUnicode_FromOrdinal(ch);
14533 break;
14534 }
14535
14536 default:
14537 PyErr_Format(PyExc_ValueError,
14538 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020014539 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020014540 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14541 (int)arg->ch,
14542 ctx->fmtpos - 1);
14543 return -1;
14544 }
14545 if (*p_str == NULL)
14546 return -1;
14547 assert (PyUnicode_Check(*p_str));
14548 return 0;
14549}
14550
14551static int
14552unicode_format_arg_output(struct unicode_formatter_t *ctx,
14553 struct unicode_format_arg_t *arg,
14554 PyObject *str)
14555{
14556 Py_ssize_t len;
14557 enum PyUnicode_Kind kind;
14558 void *pbuf;
14559 Py_ssize_t pindex;
14560 Py_UCS4 signchar;
14561 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014562 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014563 Py_ssize_t sublen;
14564 _PyUnicodeWriter *writer = &ctx->writer;
14565 Py_UCS4 fill;
14566
14567 fill = ' ';
14568 if (arg->sign && arg->flags & F_ZERO)
14569 fill = '0';
14570
14571 if (PyUnicode_READY(str) == -1)
14572 return -1;
14573
14574 len = PyUnicode_GET_LENGTH(str);
14575 if ((arg->width == -1 || arg->width <= len)
14576 && (arg->prec == -1 || arg->prec >= len)
14577 && !(arg->flags & (F_SIGN | F_BLANK)))
14578 {
14579 /* Fast path */
14580 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
14581 return -1;
14582 return 0;
14583 }
14584
14585 /* Truncate the string for "s", "r" and "a" formats
14586 if the precision is set */
14587 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
14588 if (arg->prec >= 0 && len > arg->prec)
14589 len = arg->prec;
14590 }
14591
14592 /* Adjust sign and width */
14593 kind = PyUnicode_KIND(str);
14594 pbuf = PyUnicode_DATA(str);
14595 pindex = 0;
14596 signchar = '\0';
14597 if (arg->sign) {
14598 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14599 if (ch == '-' || ch == '+') {
14600 signchar = ch;
14601 len--;
14602 pindex++;
14603 }
14604 else if (arg->flags & F_SIGN)
14605 signchar = '+';
14606 else if (arg->flags & F_BLANK)
14607 signchar = ' ';
14608 else
14609 arg->sign = 0;
14610 }
14611 if (arg->width < len)
14612 arg->width = len;
14613
14614 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014615 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014616 if (!(arg->flags & F_LJUST)) {
14617 if (arg->sign) {
14618 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014619 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014620 }
14621 else {
14622 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014623 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014624 }
14625 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014626 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
14627 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014628 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014629 }
14630
Victor Stinnera47082312012-10-04 02:19:54 +020014631 buflen = arg->width;
14632 if (arg->sign && len == arg->width)
14633 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014634 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020014635 return -1;
14636
14637 /* Write the sign if needed */
14638 if (arg->sign) {
14639 if (fill != ' ') {
14640 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14641 writer->pos += 1;
14642 }
14643 if (arg->width > len)
14644 arg->width--;
14645 }
14646
14647 /* Write the numeric prefix for "x", "X" and "o" formats
14648 if the alternate form is used.
14649 For example, write "0x" for the "%#x" format. */
14650 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14651 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14652 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14653 if (fill != ' ') {
14654 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14655 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14656 writer->pos += 2;
14657 pindex += 2;
14658 }
14659 arg->width -= 2;
14660 if (arg->width < 0)
14661 arg->width = 0;
14662 len -= 2;
14663 }
14664
14665 /* Pad left with the fill character if needed */
14666 if (arg->width > len && !(arg->flags & F_LJUST)) {
14667 sublen = arg->width - len;
14668 FILL(writer->kind, writer->data, fill, writer->pos, sublen);
14669 writer->pos += sublen;
14670 arg->width = len;
14671 }
14672
14673 /* If padding with spaces: write sign if needed and/or numeric prefix if
14674 the alternate form is used */
14675 if (fill == ' ') {
14676 if (arg->sign) {
14677 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14678 writer->pos += 1;
14679 }
14680 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14681 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14682 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
14683 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14684 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14685 writer->pos += 2;
14686 pindex += 2;
14687 }
14688 }
14689
14690 /* Write characters */
14691 if (len) {
14692 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14693 str, pindex, len);
14694 writer->pos += len;
14695 }
14696
14697 /* Pad right with the fill character if needed */
14698 if (arg->width > len) {
14699 sublen = arg->width - len;
14700 FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
14701 writer->pos += sublen;
14702 }
14703 return 0;
14704}
14705
14706/* Helper of PyUnicode_Format(): format one arg.
14707 Return 0 on success, raise an exception and return -1 on error. */
14708static int
14709unicode_format_arg(struct unicode_formatter_t *ctx)
14710{
14711 struct unicode_format_arg_t arg;
14712 PyObject *str;
14713 int ret;
14714
Victor Stinner8dbd4212012-12-04 09:30:24 +010014715 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
14716 arg.flags = 0;
14717 arg.width = -1;
14718 arg.prec = -1;
14719 arg.sign = 0;
14720 str = NULL;
14721
Victor Stinnera47082312012-10-04 02:19:54 +020014722 ret = unicode_format_arg_parse(ctx, &arg);
14723 if (ret == -1)
14724 return -1;
14725
14726 ret = unicode_format_arg_format(ctx, &arg, &str);
14727 if (ret == -1)
14728 return -1;
14729
14730 if (ret != 1) {
14731 ret = unicode_format_arg_output(ctx, &arg, str);
14732 Py_DECREF(str);
14733 if (ret == -1)
14734 return -1;
14735 }
14736
14737 if (ctx->dict && (ctx->argidx < ctx->arglen) && arg.ch != '%') {
14738 PyErr_SetString(PyExc_TypeError,
14739 "not all arguments converted during string formatting");
14740 return -1;
14741 }
14742 return 0;
14743}
14744
Alexander Belopolsky40018472011-02-26 01:02:56 +000014745PyObject *
14746PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014747{
Victor Stinnera47082312012-10-04 02:19:54 +020014748 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000014749
Guido van Rossumd57fd912000-03-10 22:53:23 +000014750 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014751 PyErr_BadInternalCall();
14752 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014753 }
Victor Stinnera47082312012-10-04 02:19:54 +020014754
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014755 if (ensure_unicode(format) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014756 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014757
14758 ctx.fmtstr = format;
Victor Stinnera47082312012-10-04 02:19:54 +020014759 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14760 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14761 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14762 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014763
Victor Stinner8f674cc2013-04-17 23:02:17 +020014764 _PyUnicodeWriter_Init(&ctx.writer);
14765 ctx.writer.min_length = ctx.fmtcnt + 100;
14766 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014767
Guido van Rossumd57fd912000-03-10 22:53:23 +000014768 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014769 ctx.arglen = PyTuple_Size(args);
14770 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014771 }
14772 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014773 ctx.arglen = -1;
14774 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014775 }
Victor Stinnera47082312012-10-04 02:19:54 +020014776 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040014777 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020014778 ctx.dict = args;
14779 else
14780 ctx.dict = NULL;
14781 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014782
Victor Stinnera47082312012-10-04 02:19:54 +020014783 while (--ctx.fmtcnt >= 0) {
14784 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020014785 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020014786
14787 nonfmtpos = ctx.fmtpos++;
14788 while (ctx.fmtcnt >= 0 &&
14789 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14790 ctx.fmtpos++;
14791 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014792 }
Victor Stinnera47082312012-10-04 02:19:54 +020014793 if (ctx.fmtcnt < 0) {
14794 ctx.fmtpos--;
14795 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020014796 }
Victor Stinneree4544c2012-05-09 22:24:08 +020014797
Victor Stinnercfc4c132013-04-03 01:48:39 +020014798 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14799 nonfmtpos, ctx.fmtpos) < 0)
14800 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014801 }
14802 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014803 ctx.fmtpos++;
14804 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000014805 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020014806 }
14807 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020014808
Victor Stinnera47082312012-10-04 02:19:54 +020014809 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014810 PyErr_SetString(PyExc_TypeError,
14811 "not all arguments converted during string formatting");
14812 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014813 }
14814
Victor Stinnera47082312012-10-04 02:19:54 +020014815 if (ctx.args_owned) {
14816 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014817 }
Victor Stinnera47082312012-10-04 02:19:54 +020014818 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014819
Benjamin Peterson29060642009-01-31 22:14:21 +000014820 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020014821 _PyUnicodeWriter_Dealloc(&ctx.writer);
14822 if (ctx.args_owned) {
14823 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014824 }
14825 return NULL;
14826}
14827
Jeremy Hylton938ace62002-07-17 16:30:39 +000014828static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000014829unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14830
Tim Peters6d6c1a32001-08-02 04:15:00 +000014831static PyObject *
14832unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14833{
Benjamin Peterson29060642009-01-31 22:14:21 +000014834 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014835 static char *kwlist[] = {"object", "encoding", "errors", 0};
14836 char *encoding = NULL;
14837 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000014838
Benjamin Peterson14339b62009-01-31 16:36:08 +000014839 if (type != &PyUnicode_Type)
14840 return unicode_subtype_new(type, args, kwds);
14841 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000014842 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000014843 return NULL;
14844 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020014845 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000014846 if (encoding == NULL && errors == NULL)
14847 return PyObject_Str(x);
14848 else
Benjamin Peterson29060642009-01-31 22:14:21 +000014849 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000014850}
14851
Guido van Rossume023fe02001-08-30 03:12:59 +000014852static PyObject *
14853unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14854{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014855 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014856 Py_ssize_t length, char_size;
14857 int share_wstr, share_utf8;
14858 unsigned int kind;
14859 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000014860
Benjamin Peterson14339b62009-01-31 16:36:08 +000014861 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014862
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014863 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014864 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014865 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014866 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050014867 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060014868 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014869 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060014870 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014871
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014872 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014873 if (self == NULL) {
14874 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014875 return NULL;
14876 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014877 kind = PyUnicode_KIND(unicode);
14878 length = PyUnicode_GET_LENGTH(unicode);
14879
14880 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014881#ifdef Py_DEBUG
14882 _PyUnicode_HASH(self) = -1;
14883#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014884 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014885#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014886 _PyUnicode_STATE(self).interned = 0;
14887 _PyUnicode_STATE(self).kind = kind;
14888 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020014889 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014890 _PyUnicode_STATE(self).ready = 1;
14891 _PyUnicode_WSTR(self) = NULL;
14892 _PyUnicode_UTF8_LENGTH(self) = 0;
14893 _PyUnicode_UTF8(self) = NULL;
14894 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020014895 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014896
14897 share_utf8 = 0;
14898 share_wstr = 0;
14899 if (kind == PyUnicode_1BYTE_KIND) {
14900 char_size = 1;
14901 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
14902 share_utf8 = 1;
14903 }
14904 else if (kind == PyUnicode_2BYTE_KIND) {
14905 char_size = 2;
14906 if (sizeof(wchar_t) == 2)
14907 share_wstr = 1;
14908 }
14909 else {
14910 assert(kind == PyUnicode_4BYTE_KIND);
14911 char_size = 4;
14912 if (sizeof(wchar_t) == 4)
14913 share_wstr = 1;
14914 }
14915
14916 /* Ensure we won't overflow the length. */
14917 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
14918 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014919 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014920 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014921 data = PyObject_MALLOC((length + 1) * char_size);
14922 if (data == NULL) {
14923 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014924 goto onError;
14925 }
14926
Victor Stinnerc3c74152011-10-02 20:39:55 +020014927 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014928 if (share_utf8) {
14929 _PyUnicode_UTF8_LENGTH(self) = length;
14930 _PyUnicode_UTF8(self) = data;
14931 }
14932 if (share_wstr) {
14933 _PyUnicode_WSTR_LENGTH(self) = length;
14934 _PyUnicode_WSTR(self) = (wchar_t *)data;
14935 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014936
Christian Heimesf051e432016-09-13 20:22:02 +020014937 memcpy(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020014938 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014939 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014940#ifdef Py_DEBUG
14941 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14942#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020014943 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010014944 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014945
14946onError:
14947 Py_DECREF(unicode);
14948 Py_DECREF(self);
14949 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000014950}
14951
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000014952PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070014953"str(object='') -> str\n\
14954str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000014955\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100014956Create a new string object from the given object. If encoding or\n\
14957errors is specified, then the object must expose a data buffer\n\
14958that will be decoded using the given encoding and error handler.\n\
14959Otherwise, returns the result of object.__str__() (if defined)\n\
14960or repr(object).\n\
14961encoding defaults to sys.getdefaultencoding().\n\
14962errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000014963
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014964static PyObject *unicode_iter(PyObject *seq);
14965
Guido van Rossumd57fd912000-03-10 22:53:23 +000014966PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000014967 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014968 "str", /* tp_name */
14969 sizeof(PyUnicodeObject), /* tp_size */
14970 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014971 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014972 (destructor)unicode_dealloc, /* tp_dealloc */
14973 0, /* tp_print */
14974 0, /* tp_getattr */
14975 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014976 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014977 unicode_repr, /* tp_repr */
14978 &unicode_as_number, /* tp_as_number */
14979 &unicode_as_sequence, /* tp_as_sequence */
14980 &unicode_as_mapping, /* tp_as_mapping */
14981 (hashfunc) unicode_hash, /* tp_hash*/
14982 0, /* tp_call*/
14983 (reprfunc) unicode_str, /* tp_str */
14984 PyObject_GenericGetAttr, /* tp_getattro */
14985 0, /* tp_setattro */
14986 0, /* tp_as_buffer */
14987 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000014988 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014989 unicode_doc, /* tp_doc */
14990 0, /* tp_traverse */
14991 0, /* tp_clear */
14992 PyUnicode_RichCompare, /* tp_richcompare */
14993 0, /* tp_weaklistoffset */
14994 unicode_iter, /* tp_iter */
14995 0, /* tp_iternext */
14996 unicode_methods, /* tp_methods */
14997 0, /* tp_members */
14998 0, /* tp_getset */
14999 &PyBaseObject_Type, /* tp_base */
15000 0, /* tp_dict */
15001 0, /* tp_descr_get */
15002 0, /* tp_descr_set */
15003 0, /* tp_dictoffset */
15004 0, /* tp_init */
15005 0, /* tp_alloc */
15006 unicode_new, /* tp_new */
15007 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015008};
15009
15010/* Initialize the Unicode implementation */
15011
Victor Stinner3a50e702011-10-18 21:21:00 +020015012int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015013{
Thomas Wouters477c8d52006-05-27 19:21:47 +000015014 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015015 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000015016 0x000A, /* LINE FEED */
15017 0x000D, /* CARRIAGE RETURN */
15018 0x001C, /* FILE SEPARATOR */
15019 0x001D, /* GROUP SEPARATOR */
15020 0x001E, /* RECORD SEPARATOR */
15021 0x0085, /* NEXT LINE */
15022 0x2028, /* LINE SEPARATOR */
15023 0x2029, /* PARAGRAPH SEPARATOR */
15024 };
15025
Fred Drakee4315f52000-05-09 19:53:39 +000015026 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020015027 _Py_INCREF_UNICODE_EMPTY();
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015028 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015029 Py_FatalError("Can't create empty string");
Serhiy Storchaka678db842013-01-26 12:16:36 +020015030 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015031
Guido van Rossumcacfc072002-05-24 19:01:59 +000015032 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000015033 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000015034
15035 /* initialize the linebreak bloom filter */
15036 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015037 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020015038 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015039
Christian Heimes26532f72013-07-20 14:57:16 +020015040 if (PyType_Ready(&EncodingMapType) < 0)
15041 Py_FatalError("Can't initialize encoding map type");
Victor Stinner3a50e702011-10-18 21:21:00 +020015042
Benjamin Petersonc4311282012-10-30 23:21:10 -040015043 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
15044 Py_FatalError("Can't initialize field name iterator type");
15045
15046 if (PyType_Ready(&PyFormatterIter_Type) < 0)
15047 Py_FatalError("Can't initialize formatter iter type");
Benjamin Petersone8ea97f2012-10-30 23:27:52 -040015048
Victor Stinner3a50e702011-10-18 21:21:00 +020015049 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015050}
15051
/* Free list clearing entry point.  This implementation has nothing to
   release: the function does no work and always returns 0. */

int
PyUnicode_ClearFreeList(void)
{
    return 0;
}
15059
Guido van Rossumd57fd912000-03-10 22:53:23 +000015060void
Thomas Wouters78890102000-07-22 19:25:51 +000015061_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015062{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000015063 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015064
Serhiy Storchaka05997252013-01-26 12:14:02 +020015065 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +000015066
Serhiy Storchaka05997252013-01-26 12:14:02 +020015067 for (i = 0; i < 256; i++)
15068 Py_CLEAR(unicode_latin1[i]);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020015069 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000015070 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015071}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015072
Walter Dörwald16807132007-05-25 13:52:07 +000015073void
15074PyUnicode_InternInPlace(PyObject **p)
15075{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015076 PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015077 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015078#ifdef Py_DEBUG
15079 assert(s != NULL);
15080 assert(_PyUnicode_CHECK(s));
15081#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000015082 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020015083 return;
15084#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015085 /* If it's a subclass, we don't really know what putting
15086 it in the interned dict might do. */
15087 if (!PyUnicode_CheckExact(s))
15088 return;
15089 if (PyUnicode_CHECK_INTERNED(s))
15090 return;
15091 if (interned == NULL) {
15092 interned = PyDict_New();
15093 if (interned == NULL) {
15094 PyErr_Clear(); /* Don't leave an exception */
15095 return;
15096 }
15097 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015098 Py_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015099 t = PyDict_SetDefault(interned, s, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015100 Py_END_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015101 if (t == NULL) {
15102 PyErr_Clear();
15103 return;
15104 }
15105 if (t != s) {
Victor Stinnerf0335102013-04-14 19:13:03 +020015106 Py_INCREF(t);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030015107 Py_SETREF(*p, t);
Victor Stinnerf0335102013-04-14 19:13:03 +020015108 return;
15109 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015110 /* The two references in interned are not counted by refcnt.
15111 The deallocator will take care of this */
15112 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015113 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000015114}
15115
15116void
15117PyUnicode_InternImmortal(PyObject **p)
15118{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015119 PyUnicode_InternInPlace(p);
15120 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015121 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015122 Py_INCREF(*p);
15123 }
Walter Dörwald16807132007-05-25 13:52:07 +000015124}
15125
15126PyObject *
15127PyUnicode_InternFromString(const char *cp)
15128{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015129 PyObject *s = PyUnicode_FromString(cp);
15130 if (s == NULL)
15131 return NULL;
15132 PyUnicode_InternInPlace(&s);
15133 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015134}
15135
Alexander Belopolsky40018472011-02-26 01:02:56 +000015136void
15137_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000015138{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015139 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015140 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015141 Py_ssize_t i, n;
15142 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000015143
Benjamin Peterson14339b62009-01-31 16:36:08 +000015144 if (interned == NULL || !PyDict_Check(interned))
15145 return;
15146 keys = PyDict_Keys(interned);
15147 if (keys == NULL || !PyList_Check(keys)) {
15148 PyErr_Clear();
15149 return;
15150 }
Walter Dörwald16807132007-05-25 13:52:07 +000015151
Benjamin Peterson14339b62009-01-31 16:36:08 +000015152 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
15153 detector, interned unicode strings are not forcibly deallocated;
15154 rather, we give them their stolen references back, and then clear
15155 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000015156
Benjamin Peterson14339b62009-01-31 16:36:08 +000015157 n = PyList_GET_SIZE(keys);
15158 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000015159 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015160 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015161 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015162 if (PyUnicode_READY(s) == -1) {
15163 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015164 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015165 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015166 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015167 case SSTATE_NOT_INTERNED:
15168 /* XXX Shouldn't happen */
15169 break;
15170 case SSTATE_INTERNED_IMMORTAL:
15171 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015172 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015173 break;
15174 case SSTATE_INTERNED_MORTAL:
15175 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015176 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015177 break;
15178 default:
15179 Py_FatalError("Inconsistent interned string state.");
15180 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015181 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015182 }
15183 fprintf(stderr, "total size of all interned strings: "
15184 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
15185 "mortal/immortal\n", mortal_size, immortal_size);
15186 Py_DECREF(keys);
15187 PyDict_Clear(interned);
Serhiy Storchaka05997252013-01-26 12:14:02 +020015188 Py_CLEAR(interned);
Walter Dörwald16807132007-05-25 13:52:07 +000015189}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015190
15191
15192/********************* Unicode Iterator **************************/
15193
15194typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015195 PyObject_HEAD
15196 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015197 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015198} unicodeiterobject;
15199
15200static void
15201unicodeiter_dealloc(unicodeiterobject *it)
15202{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015203 _PyObject_GC_UNTRACK(it);
15204 Py_XDECREF(it->it_seq);
15205 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015206}
15207
15208static int
15209unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15210{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015211 Py_VISIT(it->it_seq);
15212 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015213}
15214
15215static PyObject *
15216unicodeiter_next(unicodeiterobject *it)
15217{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015218 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015219
Benjamin Peterson14339b62009-01-31 16:36:08 +000015220 assert(it != NULL);
15221 seq = it->it_seq;
15222 if (seq == NULL)
15223 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015224 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015225
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015226 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15227 int kind = PyUnicode_KIND(seq);
15228 void *data = PyUnicode_DATA(seq);
15229 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15230 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015231 if (item != NULL)
15232 ++it->it_index;
15233 return item;
15234 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015235
Benjamin Peterson14339b62009-01-31 16:36:08 +000015236 it->it_seq = NULL;
Serhiy Storchakafbb1c5e2016-03-30 20:40:02 +030015237 Py_DECREF(seq);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015238 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015239}
15240
15241static PyObject *
15242unicodeiter_len(unicodeiterobject *it)
15243{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015244 Py_ssize_t len = 0;
15245 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015246 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015247 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015248}
15249
15250PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15251
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015252static PyObject *
15253unicodeiter_reduce(unicodeiterobject *it)
15254{
15255 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020015256 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015257 it->it_seq, it->it_index);
15258 } else {
15259 PyObject *u = PyUnicode_FromUnicode(NULL, 0);
15260 if (u == NULL)
15261 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020015262 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015263 }
15264}
15265
15266PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15267
15268static PyObject *
15269unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15270{
15271 Py_ssize_t index = PyLong_AsSsize_t(state);
15272 if (index == -1 && PyErr_Occurred())
15273 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015274 if (it->it_seq != NULL) {
15275 if (index < 0)
15276 index = 0;
15277 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15278 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15279 it->it_index = index;
15280 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015281 Py_RETURN_NONE;
15282}
15283
15284PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15285
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015286static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015287 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015288 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015289 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15290 reduce_doc},
15291 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15292 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015293 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015294};
15295
15296PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015297 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15298 "str_iterator", /* tp_name */
15299 sizeof(unicodeiterobject), /* tp_basicsize */
15300 0, /* tp_itemsize */
15301 /* methods */
15302 (destructor)unicodeiter_dealloc, /* tp_dealloc */
15303 0, /* tp_print */
15304 0, /* tp_getattr */
15305 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000015306 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015307 0, /* tp_repr */
15308 0, /* tp_as_number */
15309 0, /* tp_as_sequence */
15310 0, /* tp_as_mapping */
15311 0, /* tp_hash */
15312 0, /* tp_call */
15313 0, /* tp_str */
15314 PyObject_GenericGetAttr, /* tp_getattro */
15315 0, /* tp_setattro */
15316 0, /* tp_as_buffer */
15317 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15318 0, /* tp_doc */
15319 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15320 0, /* tp_clear */
15321 0, /* tp_richcompare */
15322 0, /* tp_weaklistoffset */
15323 PyObject_SelfIter, /* tp_iter */
15324 (iternextfunc)unicodeiter_next, /* tp_iternext */
15325 unicodeiter_methods, /* tp_methods */
15326 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015327};
15328
15329static PyObject *
15330unicode_iter(PyObject *seq)
15331{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015332 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015333
Benjamin Peterson14339b62009-01-31 16:36:08 +000015334 if (!PyUnicode_Check(seq)) {
15335 PyErr_BadInternalCall();
15336 return NULL;
15337 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015338 if (PyUnicode_READY(seq) == -1)
15339 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015340 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15341 if (it == NULL)
15342 return NULL;
15343 it->it_index = 0;
15344 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015345 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015346 _PyObject_GC_TRACK(it);
15347 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015348}
15349
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015350
15351size_t
15352Py_UNICODE_strlen(const Py_UNICODE *u)
15353{
15354 int res = 0;
15355 while(*u++)
15356 res++;
15357 return res;
15358}
15359
15360Py_UNICODE*
15361Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15362{
15363 Py_UNICODE *u = s1;
15364 while ((*u++ = *s2++));
15365 return s1;
15366}
15367
15368Py_UNICODE*
15369Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15370{
15371 Py_UNICODE *u = s1;
15372 while ((*u++ = *s2++))
15373 if (n-- == 0)
15374 break;
15375 return s1;
15376}
15377
15378Py_UNICODE*
15379Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15380{
15381 Py_UNICODE *u1 = s1;
15382 u1 += Py_UNICODE_strlen(u1);
15383 Py_UNICODE_strcpy(u1, s2);
15384 return s1;
15385}
15386
15387int
15388Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15389{
15390 while (*s1 && *s2 && *s1 == *s2)
15391 s1++, s2++;
15392 if (*s1 && *s2)
15393 return (*s1 < *s2) ? -1 : +1;
15394 if (*s1)
15395 return 1;
15396 if (*s2)
15397 return -1;
15398 return 0;
15399}
15400
15401int
15402Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15403{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015404 Py_UNICODE u1, u2;
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015405 for (; n != 0; n--) {
15406 u1 = *s1;
15407 u2 = *s2;
15408 if (u1 != u2)
15409 return (u1 < u2) ? -1 : +1;
15410 if (u1 == '\0')
15411 return 0;
15412 s1++;
15413 s2++;
15414 }
15415 return 0;
15416}
15417
15418Py_UNICODE*
15419Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15420{
15421 const Py_UNICODE *p;
15422 for (p = s; *p; p++)
15423 if (*p == c)
15424 return (Py_UNICODE*)p;
15425 return NULL;
15426}
15427
15428Py_UNICODE*
15429Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15430{
15431 const Py_UNICODE *p;
15432 p = s + Py_UNICODE_strlen(s);
15433 while (p != s) {
15434 p--;
15435 if (*p == c)
15436 return (Py_UNICODE*)p;
15437 }
15438 return NULL;
15439}
Victor Stinner331ea922010-08-10 16:37:20 +000015440
Victor Stinner71133ff2010-09-01 23:43:53 +000015441Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015442PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015443{
Victor Stinner577db2c2011-10-11 22:12:48 +020015444 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015445 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015446
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015447 if (!PyUnicode_Check(unicode)) {
15448 PyErr_BadArgument();
15449 return NULL;
15450 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015451 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015452 if (u == NULL)
15453 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015454 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -070015455 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015456 PyErr_NoMemory();
15457 return NULL;
15458 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015459 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015460 size *= sizeof(Py_UNICODE);
15461 copy = PyMem_Malloc(size);
15462 if (copy == NULL) {
15463 PyErr_NoMemory();
15464 return NULL;
15465 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015466 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015467 return copy;
15468}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015469
Georg Brandl66c221e2010-10-14 07:04:07 +000015470/* A _string module, to export formatter_parser and formatter_field_name_split
15471 to the string.Formatter class implemented in Python. */
15472
15473static PyMethodDef _string_methods[] = {
15474 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15475 METH_O, PyDoc_STR("split the argument as a field name")},
15476 {"formatter_parser", (PyCFunction) formatter_parser,
15477 METH_O, PyDoc_STR("parse the argument as a format string")},
15478 {NULL, NULL}
15479};
15480
15481static struct PyModuleDef _string_module = {
15482 PyModuleDef_HEAD_INIT,
15483 "_string",
15484 PyDoc_STR("string helper module"),
15485 0,
15486 _string_methods,
15487 NULL,
15488 NULL,
15489 NULL,
15490 NULL
15491};
15492
15493PyMODINIT_FUNC
15494PyInit__string(void)
15495{
15496 return PyModule_Create(&_string_module);
15497}
15498
15499
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000015500#ifdef __cplusplus
15501}
15502#endif