blob: 12d09f06f2d02b5573738388d45031b7470e668c [file] [log] [blame]
/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com>.

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/

Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Raymond Hettingerac2ef652015-07-04 16:04:44 -070045#include "stringlib/eq.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000046
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000047#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000048#include <windows.h>
49#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000050
/*[clinic input]
class str "PyUnicodeObject *" "&PyUnicode_Type"
[clinic start generated code]*/
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=604e916854800fa8]*/

/* --- Globals ------------------------------------------------------------

NOTE: In the interpreter's initialization phase, some globals are currently
      initialized dynamically as needed. In the process Unicode objects may
      be created before the Unicode type is ready.

*/


#ifdef __cplusplus
extern "C" {
#endif

/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
#define MAX_UNICODE 0x10ffff

/* Type check used by the accessor macros: the full consistency check in
   debug builds, a plain type check otherwise. */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif

/* Accessors for the fields of the string structs.  PyUnicode_UTF8 and
   PyUnicode_UTF8_LENGTH assert that the object is consistent and ready and
   special-case compact ASCII strings, whose character data directly follows
   the PyASCIIObject header; the others read a struct member directly
   (after an assert for _PyUnicode_KIND and _PyUnicode_GET_LENGTH). */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)

/* File-local replacement for PyUnicode_READY: evaluates to 0 when the string
   is already ready, otherwise to the result of _PyUnicode_Ready(op). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready(op)))

/* True if the UTF-8 pointer aliases the canonical character data.
   Must not be used on compact ASCII strings (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the wstr pointer aliases the canonical character data.
   The macro argument is used throughout; it must not rely on a variable
   named `unicode` existing in the caller's scope. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))

/* true if the Unicode object has an allocated UTF-8 memory block
   (not shared with other data) */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    ((!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* true if the Unicode object has an allocated wstr memory block
   (not shared with other data) */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    ((_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))

/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   The bulk of the input is copied four elements per iteration; the
   remaining (at most three) elements are copied one at a time. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (const from_type *)(begin); \
        const from_type *_end = (const from_type *)(end); \
        Py_ssize_t _n = (_end) - (_iter);               \
        const from_type *_unrolled_end =                \
            _iter + _Py_SIZE_ROUND_DOWN(_n, 4);         \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)

#ifdef MS_WINDOWS
   /* On Windows, overallocating by 50% is the best factor */
#  define OVERALLOCATE_FACTOR 2
#else
   /* On Linux, overallocating by 25% is the best factor */
#  define OVERALLOCATE_FACTOR 4
#endif

Walter Dörwald16807132007-05-25 13:52:07 +0000174/* This dictionary holds all interned unicode strings. Note that references
175 to strings in this dictionary are *not* counted in the string's ob_refcnt.
176 When the interned string reaches a refcnt of 0 the string deallocation
177 function will delete the reference from this dictionary.
178
179 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000180 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000181*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200182static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000183
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000184/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200185static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200186
/* Take a new reference to the shared empty string, creating it with
   PyUnicode_New(0, 0) on first use.  If that creation fails, unicode_empty
   is left NULL and no reference is taken. */
#define _Py_INCREF_UNICODE_EMPTY()                      \
    do {                                                \
        if (unicode_empty != NULL)                      \
            Py_INCREF(unicode_empty);                   \
        else {                                          \
            unicode_empty = PyUnicode_New(0, 0);        \
            if (unicode_empty != NULL) {                \
                Py_INCREF(unicode_empty);               \
                assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
            }                                           \
        }                                               \
    } while (0)

/* Return a new reference to the shared empty string from the enclosing
   function (returns NULL if the empty string could not be created). */
#define _Py_RETURN_UNICODE_EMPTY()                      \
    do {                                                \
        _Py_INCREF_UNICODE_EMPTY();                     \
        return unicode_empty;                           \
    } while (0)

Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200206/* Forward declaration */
207Py_LOCAL_INLINE(int)
208_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
209
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200210/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200211static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200212
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000213/* Single character Unicode strings in the Latin-1 range are being
214 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200215static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000216
/* Fast detection of the most frequent whitespace characters.
   Indexed by code point 0..127; a 1 marks a whitespace character. */
const unsigned char _Py_ascii_whitespace[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x0009: * CHARACTER TABULATION */
/*     case 0x000A: * LINE FEED */
/*     case 0x000B: * LINE TABULATION */
/*     case 0x000C: * FORM FEED */
/*     case 0x000D: * CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x001C: * FILE SEPARATOR */
/*     case 0x001D: * GROUP SEPARATOR */
/*     case 0x001E: * RECORD SEPARATOR */
/*     case 0x001F: * UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
/*     case 0x0020: * SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};

Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200248/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200249static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200250static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100251static int unicode_modifiable(PyObject *unicode);
252
Victor Stinnerfe226c02011-10-03 03:52:20 +0200253
Alexander Belopolsky40018472011-02-26 01:02:56 +0000254static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100255_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200256static PyObject *
257_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
258static PyObject *
259_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
260
261static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000262unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000263 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100264 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000265 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
266
Alexander Belopolsky40018472011-02-26 01:02:56 +0000267static void
268raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300269 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100270 PyObject *unicode,
271 Py_ssize_t startpos, Py_ssize_t endpos,
272 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000273
/* Same for linebreaks: indexed by code point 0..127; a 1 marks a
   line-breaking character. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};

Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300302#include "clinic/unicodeobject.c.h"
303
/* Identifiers for the codec error handlers recognized by name in
   get_error_handler(); any other name maps to _Py_ERROR_OTHER. */
typedef enum {
    _Py_ERROR_UNKNOWN=0,
    _Py_ERROR_STRICT,
    _Py_ERROR_SURROGATEESCAPE,
    _Py_ERROR_REPLACE,
    _Py_ERROR_IGNORE,
    _Py_ERROR_BACKSLASHREPLACE,
    _Py_ERROR_SURROGATEPASS,
    _Py_ERROR_XMLCHARREFREPLACE,
    _Py_ERROR_OTHER
} _Py_error_handler;

/* Map an error handler name to its _Py_error_handler value.

   errors: handler name, or NULL (treated like "strict").
   Returns the matching enum value; the comparison is exact and
   case-sensitive, and unrecognized names yield _Py_ERROR_OTHER. */
static _Py_error_handler
get_error_handler(const char *errors)
{
    if (errors == NULL || strcmp(errors, "strict") == 0) {
        return _Py_ERROR_STRICT;
    }
    if (strcmp(errors, "surrogateescape") == 0) {
        return _Py_ERROR_SURROGATEESCAPE;
    }
    if (strcmp(errors, "replace") == 0) {
        return _Py_ERROR_REPLACE;
    }
    if (strcmp(errors, "ignore") == 0) {
        return _Py_ERROR_IGNORE;
    }
    if (strcmp(errors, "backslashreplace") == 0) {
        return _Py_ERROR_BACKSLASHREPLACE;
    }
    if (strcmp(errors, "surrogatepass") == 0) {
        return _Py_ERROR_SURROGATEPASS;
    }
    if (strcmp(errors, "xmlcharrefreplace") == 0) {
        return _Py_ERROR_XMLCHARREFREPLACE;
    }
    return _Py_ERROR_OTHER;
}

Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300343/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
344 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000345Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000346PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000347{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000348#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000349 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000350#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000351 /* This is actually an illegal character, so it should
352 not be passed to unichr. */
353 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000354#endif
355}
356
#ifdef Py_DEBUG
/* Debug-only structural check of a string object.

   Asserts the invariants linking the state flags (kind, compact, ascii,
   ready, interned) with the data, utf8 and wstr pointers and lengths.
   When check_content is non-zero (and the string is not of the legacy
   wchar kind) it also scans the characters to verify that the narrowest
   suitable kind is used and that the data is NUL-terminated.

   Failures abort through assert(); otherwise the function returns 1, so it
   can itself be wrapped in assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* compact, non-ASCII: data directly follows the struct */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }
        else {
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* legacy string that is not ready yet: only wstr is set */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
            else {
                /* legacy string that is ready */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
                else
                    assert (compact->utf8 != data);
            }
        }
        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr may only alias the data when the kind has the same
               width as wchar_t */
            if (
#if SIZEOF_WCHAR_T == 2
                kind == PyUnicode_2BYTE_KIND
#else
                kind == PyUnicode_4BYTE_KIND
#endif
               )
            {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            } else
                assert(ascii->wstr != data);
        }

        if (compact->utf8 == NULL)
            assert(compact->utf8_length == 0);
        if (ascii->wstr == NULL)
            assert(compact->wstr_length == 0);
    }
    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND)
    {
        Py_ssize_t i;
        Py_UCS4 maxchar = 0;
        void *data;
        Py_UCS4 ch;

        data = PyUnicode_DATA(ascii);
        for (i=0; i < ascii->length; i++)
        {
            ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
        }
        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else
                assert(maxchar < 128);
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
        }
        else {
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
        }
        /* the character data must be NUL-terminated */
        assert(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;
}
#endif

Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100473static PyObject*
474unicode_result_wchar(PyObject *unicode)
475{
476#ifndef Py_DEBUG
477 Py_ssize_t len;
478
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100479 len = _PyUnicode_WSTR_LENGTH(unicode);
480 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100481 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200482 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100483 }
484
485 if (len == 1) {
486 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100487 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100488 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
489 Py_DECREF(unicode);
490 return latin1_char;
491 }
492 }
493
494 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200495 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100496 return NULL;
497 }
498#else
Victor Stinneraa771272012-10-04 02:32:58 +0200499 assert(Py_REFCNT(unicode) == 1);
500
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100501 /* don't make the result ready in debug mode to ensure that the caller
502 makes the string ready before using it */
503 assert(_PyUnicode_CheckConsistency(unicode, 1));
504#endif
505 return unicode;
506}
507
508static PyObject*
509unicode_result_ready(PyObject *unicode)
510{
511 Py_ssize_t length;
512
513 length = PyUnicode_GET_LENGTH(unicode);
514 if (length == 0) {
515 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100516 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200517 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100518 }
519 return unicode_empty;
520 }
521
522 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200523 void *data = PyUnicode_DATA(unicode);
524 int kind = PyUnicode_KIND(unicode);
525 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100526 if (ch < 256) {
527 PyObject *latin1_char = unicode_latin1[ch];
528 if (latin1_char != NULL) {
529 if (unicode != latin1_char) {
530 Py_INCREF(latin1_char);
531 Py_DECREF(unicode);
532 }
533 return latin1_char;
534 }
535 else {
536 assert(_PyUnicode_CheckConsistency(unicode, 1));
537 Py_INCREF(unicode);
538 unicode_latin1[ch] = unicode;
539 return unicode;
540 }
541 }
542 }
543
544 assert(_PyUnicode_CheckConsistency(unicode, 1));
545 return unicode;
546}
547
548static PyObject*
549unicode_result(PyObject *unicode)
550{
551 assert(_PyUnicode_CHECK(unicode));
552 if (PyUnicode_IS_READY(unicode))
553 return unicode_result_ready(unicode);
554 else
555 return unicode_result_wchar(unicode);
556}
557
Victor Stinnerc4b49542011-12-11 22:44:26 +0100558static PyObject*
559unicode_result_unchanged(PyObject *unicode)
560{
561 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500562 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100563 return NULL;
564 Py_INCREF(unicode);
565 return unicode;
566 }
567 else
568 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100569 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100570}
571
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200572/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
573 ASCII, Latin1, UTF-8, etc. */
574static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200575backslashreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200576 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
577{
Victor Stinnerad771582015-10-09 12:38:53 +0200578 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200579 Py_UCS4 ch;
580 enum PyUnicode_Kind kind;
581 void *data;
582
583 assert(PyUnicode_IS_READY(unicode));
584 kind = PyUnicode_KIND(unicode);
585 data = PyUnicode_DATA(unicode);
586
587 size = 0;
588 /* determine replacement size */
589 for (i = collstart; i < collend; ++i) {
590 Py_ssize_t incr;
591
592 ch = PyUnicode_READ(kind, data, i);
593 if (ch < 0x100)
594 incr = 2+2;
595 else if (ch < 0x10000)
596 incr = 2+4;
597 else {
598 assert(ch <= MAX_UNICODE);
Victor Stinner3fa36ff2015-10-09 03:37:11 +0200599 incr = 2+8;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200600 }
601 if (size > PY_SSIZE_T_MAX - incr) {
602 PyErr_SetString(PyExc_OverflowError,
603 "encoded result is too long for a Python string");
604 return NULL;
605 }
606 size += incr;
607 }
608
Victor Stinnerad771582015-10-09 12:38:53 +0200609 str = _PyBytesWriter_Prepare(writer, str, size);
610 if (str == NULL)
611 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200612
613 /* generate replacement */
614 for (i = collstart; i < collend; ++i) {
615 ch = PyUnicode_READ(kind, data, i);
Victor Stinner797485e2015-10-09 03:17:30 +0200616 *str++ = '\\';
617 if (ch >= 0x00010000) {
618 *str++ = 'U';
619 *str++ = Py_hexdigits[(ch>>28)&0xf];
620 *str++ = Py_hexdigits[(ch>>24)&0xf];
621 *str++ = Py_hexdigits[(ch>>20)&0xf];
622 *str++ = Py_hexdigits[(ch>>16)&0xf];
623 *str++ = Py_hexdigits[(ch>>12)&0xf];
624 *str++ = Py_hexdigits[(ch>>8)&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200625 }
Victor Stinner797485e2015-10-09 03:17:30 +0200626 else if (ch >= 0x100) {
627 *str++ = 'u';
628 *str++ = Py_hexdigits[(ch>>12)&0xf];
629 *str++ = Py_hexdigits[(ch>>8)&0xf];
630 }
631 else
632 *str++ = 'x';
633 *str++ = Py_hexdigits[(ch>>4)&0xf];
634 *str++ = Py_hexdigits[ch&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200635 }
636 return str;
637}
638
639/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
640 ASCII, Latin1, UTF-8, etc. */
641static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200642xmlcharrefreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200643 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
644{
Victor Stinnerad771582015-10-09 12:38:53 +0200645 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200646 Py_UCS4 ch;
647 enum PyUnicode_Kind kind;
648 void *data;
649
650 assert(PyUnicode_IS_READY(unicode));
651 kind = PyUnicode_KIND(unicode);
652 data = PyUnicode_DATA(unicode);
653
654 size = 0;
655 /* determine replacement size */
656 for (i = collstart; i < collend; ++i) {
657 Py_ssize_t incr;
658
659 ch = PyUnicode_READ(kind, data, i);
660 if (ch < 10)
661 incr = 2+1+1;
662 else if (ch < 100)
663 incr = 2+2+1;
664 else if (ch < 1000)
665 incr = 2+3+1;
666 else if (ch < 10000)
667 incr = 2+4+1;
668 else if (ch < 100000)
669 incr = 2+5+1;
670 else if (ch < 1000000)
671 incr = 2+6+1;
672 else {
673 assert(ch <= MAX_UNICODE);
674 incr = 2+7+1;
675 }
676 if (size > PY_SSIZE_T_MAX - incr) {
677 PyErr_SetString(PyExc_OverflowError,
678 "encoded result is too long for a Python string");
679 return NULL;
680 }
681 size += incr;
682 }
683
Victor Stinnerad771582015-10-09 12:38:53 +0200684 str = _PyBytesWriter_Prepare(writer, str, size);
685 if (str == NULL)
686 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200687
688 /* generate replacement */
689 for (i = collstart; i < collend; ++i) {
690 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
691 }
692 return str;
693}
694
Thomas Wouters477c8d52006-05-27 19:21:47 +0000695/* --- Bloom Filters ----------------------------------------------------- */
696
697/* stuff to implement simple "bloom filters" for Unicode characters.
698 to keep things simple, we use a single bitmask, using the least 5
699 bits from each unicode characters as the bit index. */
700
701/* the linebreak mask is set up by Unicode_Init below */
702
Antoine Pitrouf068f942010-01-13 14:19:12 +0000703#if LONG_BIT >= 128
704#define BLOOM_WIDTH 128
705#elif LONG_BIT >= 64
706#define BLOOM_WIDTH 64
707#elif LONG_BIT >= 32
708#define BLOOM_WIDTH 32
709#else
710#error "LONG_BIT is smaller than 32"
711#endif
712
Thomas Wouters477c8d52006-05-27 19:21:47 +0000713#define BLOOM_MASK unsigned long
714
Serhiy Storchaka05997252013-01-26 12:14:02 +0200715static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000716
Antoine Pitrouf068f942010-01-13 14:19:12 +0000717#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000718
Benjamin Peterson29060642009-01-31 22:14:21 +0000719#define BLOOM_LINEBREAK(ch) \
720 ((ch) < 128U ? ascii_linebreak[(ch)] : \
721 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000722
Alexander Belopolsky40018472011-02-26 01:02:56 +0000723Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200724make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000725{
Victor Stinnera85af502013-04-09 21:53:54 +0200726#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
727 do { \
728 TYPE *data = (TYPE *)PTR; \
729 TYPE *end = data + LEN; \
730 Py_UCS4 ch; \
731 for (; data != end; data++) { \
732 ch = *data; \
733 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
734 } \
735 break; \
736 } while (0)
737
Thomas Wouters477c8d52006-05-27 19:21:47 +0000738 /* calculate simple bloom-style bitmask for a given unicode string */
739
Antoine Pitrouf068f942010-01-13 14:19:12 +0000740 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000741
742 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200743 switch (kind) {
744 case PyUnicode_1BYTE_KIND:
745 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
746 break;
747 case PyUnicode_2BYTE_KIND:
748 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
749 break;
750 case PyUnicode_4BYTE_KIND:
751 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
752 break;
753 default:
754 assert(0);
755 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000756 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200757
758#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000759}
760
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300761static int
762ensure_unicode(PyObject *obj)
763{
764 if (!PyUnicode_Check(obj)) {
765 PyErr_Format(PyExc_TypeError,
766 "must be str, not %.100s",
767 Py_TYPE(obj)->tp_name);
768 return -1;
769 }
770 return PyUnicode_READY(obj);
771}
772
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200773/* Compilation of templated routines */
774
775#include "stringlib/asciilib.h"
776#include "stringlib/fastsearch.h"
777#include "stringlib/partition.h"
778#include "stringlib/split.h"
779#include "stringlib/count.h"
780#include "stringlib/find.h"
781#include "stringlib/find_max_char.h"
782#include "stringlib/localeutil.h"
783#include "stringlib/undef.h"
784
785#include "stringlib/ucs1lib.h"
786#include "stringlib/fastsearch.h"
787#include "stringlib/partition.h"
788#include "stringlib/split.h"
789#include "stringlib/count.h"
790#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300791#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200792#include "stringlib/find_max_char.h"
793#include "stringlib/localeutil.h"
794#include "stringlib/undef.h"
795
796#include "stringlib/ucs2lib.h"
797#include "stringlib/fastsearch.h"
798#include "stringlib/partition.h"
799#include "stringlib/split.h"
800#include "stringlib/count.h"
801#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300802#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200803#include "stringlib/find_max_char.h"
804#include "stringlib/localeutil.h"
805#include "stringlib/undef.h"
806
807#include "stringlib/ucs4lib.h"
808#include "stringlib/fastsearch.h"
809#include "stringlib/partition.h"
810#include "stringlib/split.h"
811#include "stringlib/count.h"
812#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300813#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200814#include "stringlib/find_max_char.h"
815#include "stringlib/localeutil.h"
816#include "stringlib/undef.h"
817
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200818#include "stringlib/unicodedefs.h"
819#include "stringlib/fastsearch.h"
820#include "stringlib/count.h"
821#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100822#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200823
Guido van Rossumd57fd912000-03-10 22:53:23 +0000824/* --- Unicode Object ----------------------------------------------------- */
825
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200826static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200827fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200828
Serhiy Storchakad9d769f2015-03-24 21:55:47 +0200829Py_LOCAL_INLINE(Py_ssize_t) findchar(const void *s, int kind,
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200830 Py_ssize_t size, Py_UCS4 ch,
831 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200832{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200833 switch (kind) {
834 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200835 if ((Py_UCS1) ch != ch)
836 return -1;
837 if (direction > 0)
838 return ucs1lib_find_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
839 else
840 return ucs1lib_rfind_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200841 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200842 if ((Py_UCS2) ch != ch)
843 return -1;
844 if (direction > 0)
845 return ucs2lib_find_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
846 else
847 return ucs2lib_rfind_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200848 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200849 if (direction > 0)
850 return ucs4lib_find_char((Py_UCS4 *) s, size, ch);
851 else
852 return ucs4lib_rfind_char((Py_UCS4 *) s, size, ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200853 default:
854 assert(0);
855 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200856 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200857}
858
#ifdef Py_DEBUG
/* Overwrite the characters old_length .. length-1 of a Unicode string with
   0xff bytes, so that data the caller forgets to initialize is detected
   earlier.  Nothing is written when the string did not grow.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    /* the kind value is used as the size in bytes of one character */
    int char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        memset(bytes + old_length * char_size, 0xff,
               (new_length - old_length) * char_size);
    }
}
#endif
877
Victor Stinnerfe226c02011-10-03 03:52:20 +0200878static PyObject*
879resize_compact(PyObject *unicode, Py_ssize_t length)
880{
881 Py_ssize_t char_size;
882 Py_ssize_t struct_size;
883 Py_ssize_t new_size;
884 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100885 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +0200886#ifdef Py_DEBUG
887 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
888#endif
889
Victor Stinner79891572012-05-03 13:43:07 +0200890 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200891 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100892 assert(PyUnicode_IS_COMPACT(unicode));
893
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200894 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100895 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200896 struct_size = sizeof(PyASCIIObject);
897 else
898 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200899 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200900
Victor Stinnerfe226c02011-10-03 03:52:20 +0200901 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
902 PyErr_NoMemory();
903 return NULL;
904 }
905 new_size = (struct_size + (length + 1) * char_size);
906
Serhiy Storchaka7aa69082015-12-03 01:02:03 +0200907 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
908 PyObject_DEL(_PyUnicode_UTF8(unicode));
909 _PyUnicode_UTF8(unicode) = NULL;
910 _PyUnicode_UTF8_LENGTH(unicode) = 0;
911 }
Victor Stinner84def372011-12-11 20:04:56 +0100912 _Py_DEC_REFTOTAL;
913 _Py_ForgetReference(unicode);
914
Serhiy Storchaka20b39b22014-09-28 11:27:24 +0300915 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +0100916 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100917 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200918 PyErr_NoMemory();
919 return NULL;
920 }
Victor Stinner84def372011-12-11 20:04:56 +0100921 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200922 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100923
Victor Stinnerfe226c02011-10-03 03:52:20 +0200924 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200925 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200926 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100927 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200928 _PyUnicode_WSTR_LENGTH(unicode) = length;
929 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100930 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
931 PyObject_DEL(_PyUnicode_WSTR(unicode));
932 _PyUnicode_WSTR(unicode) = NULL;
Victor Stinner5bc03a62016-01-27 16:56:53 +0100933 if (!PyUnicode_IS_ASCII(unicode))
934 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100935 }
Victor Stinnerafffce42012-10-03 23:03:17 +0200936#ifdef Py_DEBUG
937 unicode_fill_invalid(unicode, old_length);
938#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200939 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
940 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200941 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200942 return unicode;
943}
944
Alexander Belopolsky40018472011-02-26 01:02:56 +0000945static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200946resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000947{
Victor Stinner95663112011-10-04 01:03:50 +0200948 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100949 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200950 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200951 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000952
Victor Stinnerfe226c02011-10-03 03:52:20 +0200953 if (PyUnicode_IS_READY(unicode)) {
954 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200955 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200956 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +0200957#ifdef Py_DEBUG
958 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
959#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200960
961 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200962 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200963 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
964 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200965
966 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
967 PyErr_NoMemory();
968 return -1;
969 }
970 new_size = (length + 1) * char_size;
971
Victor Stinner7a9105a2011-12-12 00:13:42 +0100972 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
973 {
974 PyObject_DEL(_PyUnicode_UTF8(unicode));
975 _PyUnicode_UTF8(unicode) = NULL;
976 _PyUnicode_UTF8_LENGTH(unicode) = 0;
977 }
978
Victor Stinnerfe226c02011-10-03 03:52:20 +0200979 data = (PyObject *)PyObject_REALLOC(data, new_size);
980 if (data == NULL) {
981 PyErr_NoMemory();
982 return -1;
983 }
984 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200985 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200986 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200987 _PyUnicode_WSTR_LENGTH(unicode) = length;
988 }
989 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200990 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200991 _PyUnicode_UTF8_LENGTH(unicode) = length;
992 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200993 _PyUnicode_LENGTH(unicode) = length;
994 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +0200995#ifdef Py_DEBUG
996 unicode_fill_invalid(unicode, old_length);
997#endif
Victor Stinner95663112011-10-04 01:03:50 +0200998 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200999 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001000 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001001 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001002 }
Victor Stinner95663112011-10-04 01:03:50 +02001003 assert(_PyUnicode_WSTR(unicode) != NULL);
1004
1005 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001006 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +02001007 PyErr_NoMemory();
1008 return -1;
1009 }
Victor Stinner7a9105a2011-12-12 00:13:42 +01001010 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +02001011 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +01001012 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +02001013 if (!wstr) {
1014 PyErr_NoMemory();
1015 return -1;
1016 }
1017 _PyUnicode_WSTR(unicode) = wstr;
1018 _PyUnicode_WSTR(unicode)[length] = 0;
1019 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001020 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001021 return 0;
1022}
1023
Victor Stinnerfe226c02011-10-03 03:52:20 +02001024static PyObject*
1025resize_copy(PyObject *unicode, Py_ssize_t length)
1026{
1027 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001028 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001029 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001030
Benjamin Petersonbac79492012-01-14 13:34:47 -05001031 if (PyUnicode_READY(unicode) == -1)
Victor Stinner7a9105a2011-12-12 00:13:42 +01001032 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001033
1034 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1035 if (copy == NULL)
1036 return NULL;
1037
1038 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +02001039 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001040 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +02001041 }
1042 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001043 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001044
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001045 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001046 if (w == NULL)
1047 return NULL;
1048 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1049 copy_length = Py_MIN(copy_length, length);
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +02001050 Py_MEMCPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
1051 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001052 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001053 }
1054}
1055
Guido van Rossumd57fd912000-03-10 22:53:23 +00001056/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +00001057 Ux0000 terminated; some code (e.g. new_identifier)
1058 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001059
1060 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +00001061 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001062
1063*/
1064
Alexander Belopolsky40018472011-02-26 01:02:56 +00001065static PyUnicodeObject *
1066_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001067{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001068 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001069 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001070
Thomas Wouters477c8d52006-05-27 19:21:47 +00001071 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001072 if (length == 0 && unicode_empty != NULL) {
1073 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001074 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001075 }
1076
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001077 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001078 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001079 return (PyUnicodeObject *)PyErr_NoMemory();
1080 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001081 if (length < 0) {
1082 PyErr_SetString(PyExc_SystemError,
1083 "Negative size passed to _PyUnicode_New");
1084 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001085 }
1086
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001087 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1088 if (unicode == NULL)
1089 return NULL;
1090 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +01001091
1092 _PyUnicode_WSTR_LENGTH(unicode) = length;
1093 _PyUnicode_HASH(unicode) = -1;
1094 _PyUnicode_STATE(unicode).interned = 0;
1095 _PyUnicode_STATE(unicode).kind = 0;
1096 _PyUnicode_STATE(unicode).compact = 0;
1097 _PyUnicode_STATE(unicode).ready = 0;
1098 _PyUnicode_STATE(unicode).ascii = 0;
1099 _PyUnicode_DATA_ANY(unicode) = NULL;
1100 _PyUnicode_LENGTH(unicode) = 0;
1101 _PyUnicode_UTF8(unicode) = NULL;
1102 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1103
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001104 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1105 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001106 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00001107 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001108 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00001109 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001110
Jeremy Hyltond8082792003-09-16 19:41:39 +00001111 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +00001112 * the caller fails before initializing str -- unicode_resize()
1113 * reads str[0], and the Keep-Alive optimization can keep memory
1114 * allocated for str alive across a call to unicode_dealloc(unicode).
1115 * We don't want unicode_resize to read uninitialized memory in
1116 * that case.
1117 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001118 _PyUnicode_WSTR(unicode)[0] = 0;
1119 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +01001120
Victor Stinner7931d9a2011-11-04 00:22:48 +01001121 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001122 return unicode;
1123}
1124
Victor Stinnerf42dc442011-10-02 23:33:16 +02001125static const char*
1126unicode_kind_name(PyObject *unicode)
1127{
Victor Stinner42dfd712011-10-03 14:41:45 +02001128 /* don't check consistency: unicode_kind_name() is called from
1129 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +02001130 if (!PyUnicode_IS_COMPACT(unicode))
1131 {
1132 if (!PyUnicode_IS_READY(unicode))
1133 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -06001134 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001135 {
1136 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001137 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001138 return "legacy ascii";
1139 else
1140 return "legacy latin1";
1141 case PyUnicode_2BYTE_KIND:
1142 return "legacy UCS2";
1143 case PyUnicode_4BYTE_KIND:
1144 return "legacy UCS4";
1145 default:
1146 return "<legacy invalid kind>";
1147 }
1148 }
1149 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -06001150 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001151 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001152 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001153 return "ascii";
1154 else
Victor Stinnera3b334d2011-10-03 13:53:37 +02001155 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001156 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001157 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001158 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001159 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001160 default:
1161 return "<invalid compact kind>";
1162 }
1163}
1164
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001165#ifdef Py_DEBUG
/* Functions wrapping macros for use in debugger */

/* Function form of the PyUnicode_UTF8() macro. */
char *_PyUnicode_utf8(void *unicode){
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
1170
/* Function form of the _PyUnicode_COMPACT_DATA() macro, for use in a
   debugger. */
void *_PyUnicode_compact_data(void *unicode) {
    void *payload = _PyUnicode_COMPACT_DATA(unicode);
    return payload;
}
1174void *_PyUnicode_data(void *unicode){
1175 printf("obj %p\n", unicode);
1176 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1177 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1178 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1179 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1180 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1181 return PyUnicode_DATA(unicode);
1182}
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001183
1184void
1185_PyUnicode_Dump(PyObject *op)
1186{
1187 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001188 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1189 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1190 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001191
Victor Stinnera849a4b2011-10-03 12:12:11 +02001192 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001193 {
1194 if (ascii->state.ascii)
1195 data = (ascii + 1);
1196 else
1197 data = (compact + 1);
1198 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001199 else
1200 data = unicode->data.any;
Victor Stinner293f3f52014-07-01 08:57:10 +02001201 printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1202 unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001203
Victor Stinnera849a4b2011-10-03 12:12:11 +02001204 if (ascii->wstr == data)
1205 printf("shared ");
1206 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001207
Victor Stinnera3b334d2011-10-03 13:53:37 +02001208 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinner293f3f52014-07-01 08:57:10 +02001209 printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
Victor Stinnera849a4b2011-10-03 12:12:11 +02001210 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1211 printf("shared ");
Victor Stinner293f3f52014-07-01 08:57:10 +02001212 printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
1213 compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001214 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001215 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001216}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001217#endif
1218
1219PyObject *
1220PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1221{
1222 PyObject *obj;
1223 PyCompactUnicodeObject *unicode;
1224 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001225 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001226 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001227 Py_ssize_t char_size;
1228 Py_ssize_t struct_size;
1229
1230 /* Optimization for empty strings */
1231 if (size == 0 && unicode_empty != NULL) {
1232 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001233 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001234 }
1235
Victor Stinner9e9d6892011-10-04 01:02:02 +02001236 is_ascii = 0;
1237 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001238 struct_size = sizeof(PyCompactUnicodeObject);
1239 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001240 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001241 char_size = 1;
1242 is_ascii = 1;
1243 struct_size = sizeof(PyASCIIObject);
1244 }
1245 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001246 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001247 char_size = 1;
1248 }
1249 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001250 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001251 char_size = 2;
1252 if (sizeof(wchar_t) == 2)
1253 is_sharing = 1;
1254 }
1255 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001256 if (maxchar > MAX_UNICODE) {
1257 PyErr_SetString(PyExc_SystemError,
1258 "invalid maximum character passed to PyUnicode_New");
1259 return NULL;
1260 }
Victor Stinner8f825062012-04-27 13:55:39 +02001261 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001262 char_size = 4;
1263 if (sizeof(wchar_t) == 4)
1264 is_sharing = 1;
1265 }
1266
1267 /* Ensure we won't overflow the size. */
1268 if (size < 0) {
1269 PyErr_SetString(PyExc_SystemError,
1270 "Negative size passed to PyUnicode_New");
1271 return NULL;
1272 }
1273 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1274 return PyErr_NoMemory();
1275
1276 /* Duplicated allocation code from _PyObject_New() instead of a call to
1277 * PyObject_New() so we are able to allocate space for the object and
1278 * it's data buffer.
1279 */
1280 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1281 if (obj == NULL)
1282 return PyErr_NoMemory();
1283 obj = PyObject_INIT(obj, &PyUnicode_Type);
1284 if (obj == NULL)
1285 return NULL;
1286
1287 unicode = (PyCompactUnicodeObject *)obj;
1288 if (is_ascii)
1289 data = ((PyASCIIObject*)obj) + 1;
1290 else
1291 data = unicode + 1;
1292 _PyUnicode_LENGTH(unicode) = size;
1293 _PyUnicode_HASH(unicode) = -1;
1294 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001295 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001296 _PyUnicode_STATE(unicode).compact = 1;
1297 _PyUnicode_STATE(unicode).ready = 1;
1298 _PyUnicode_STATE(unicode).ascii = is_ascii;
1299 if (is_ascii) {
1300 ((char*)data)[size] = 0;
1301 _PyUnicode_WSTR(unicode) = NULL;
1302 }
Victor Stinner8f825062012-04-27 13:55:39 +02001303 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001304 ((char*)data)[size] = 0;
1305 _PyUnicode_WSTR(unicode) = NULL;
1306 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001307 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001308 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001309 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001310 else {
1311 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001312 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001313 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001314 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001315 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001316 ((Py_UCS4*)data)[size] = 0;
1317 if (is_sharing) {
1318 _PyUnicode_WSTR_LENGTH(unicode) = size;
1319 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1320 }
1321 else {
1322 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1323 _PyUnicode_WSTR(unicode) = NULL;
1324 }
1325 }
Victor Stinner8f825062012-04-27 13:55:39 +02001326#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001327 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001328#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001329 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001330 return obj;
1331}
1332
#if SIZEOF_WCHAR_T == 2
/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
   will decode surrogate pairs, the other conversions are implemented as macros
   for efficiency.

   The wchar_t characters in [begin, end) are written to the 4-byte data of
   unicode: a high surrogate directly followed by a low surrogate becomes
   one code point, every other character is copied unchanged.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    src = begin;
    while (src < end) {
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        /* src[1] is only read once it is known to lie before end */
        if (Py_UNICODE_IS_HIGH_SURROGATE(src[0])
            && (src+1) < end
            && Py_UNICODE_IS_LOW_SURROGATE(src[1]))
        {
            *dst++ = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
            src += 2;
            continue;
        }
        *dst++ = *src++;
    }
    /* the output must have been filled exactly */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));

}
#endif
1372
Victor Stinnercd9950f2011-10-02 00:34:53 +02001373static int
Victor Stinner488fa492011-12-12 00:01:39 +01001374unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001375{
Victor Stinner488fa492011-12-12 00:01:39 +01001376 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001377 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001378 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001379 return -1;
1380 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001381 return 0;
1382}
1383
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001384static int
1385_copy_characters(PyObject *to, Py_ssize_t to_start,
1386 PyObject *from, Py_ssize_t from_start,
1387 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001388{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001389 unsigned int from_kind, to_kind;
1390 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001391
Victor Stinneree4544c2012-05-09 22:24:08 +02001392 assert(0 <= how_many);
1393 assert(0 <= from_start);
1394 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001395 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001396 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001397 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001398
Victor Stinnerd3f08822012-05-29 12:57:52 +02001399 assert(PyUnicode_Check(to));
1400 assert(PyUnicode_IS_READY(to));
1401 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1402
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001403 if (how_many == 0)
1404 return 0;
1405
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001406 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001407 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001408 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001409 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001410
Victor Stinnerf1852262012-06-16 16:38:26 +02001411#ifdef Py_DEBUG
1412 if (!check_maxchar
1413 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1414 {
1415 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1416 Py_UCS4 ch;
1417 Py_ssize_t i;
1418 for (i=0; i < how_many; i++) {
1419 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1420 assert(ch <= to_maxchar);
1421 }
1422 }
1423#endif
1424
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001425 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001426 if (check_maxchar
1427 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1428 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001429 /* Writing Latin-1 characters into an ASCII string requires to
1430 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001431 Py_UCS4 max_char;
1432 max_char = ucs1lib_find_max_char(from_data,
1433 (Py_UCS1*)from_data + how_many);
1434 if (max_char >= 128)
1435 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001436 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001437 Py_MEMCPY((char*)to_data + to_kind * to_start,
1438 (char*)from_data + from_kind * from_start,
1439 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001440 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001441 else if (from_kind == PyUnicode_1BYTE_KIND
1442 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001443 {
1444 _PyUnicode_CONVERT_BYTES(
1445 Py_UCS1, Py_UCS2,
1446 PyUnicode_1BYTE_DATA(from) + from_start,
1447 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1448 PyUnicode_2BYTE_DATA(to) + to_start
1449 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001450 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001451 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001452 && to_kind == PyUnicode_4BYTE_KIND)
1453 {
1454 _PyUnicode_CONVERT_BYTES(
1455 Py_UCS1, Py_UCS4,
1456 PyUnicode_1BYTE_DATA(from) + from_start,
1457 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1458 PyUnicode_4BYTE_DATA(to) + to_start
1459 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001460 }
1461 else if (from_kind == PyUnicode_2BYTE_KIND
1462 && to_kind == PyUnicode_4BYTE_KIND)
1463 {
1464 _PyUnicode_CONVERT_BYTES(
1465 Py_UCS2, Py_UCS4,
1466 PyUnicode_2BYTE_DATA(from) + from_start,
1467 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1468 PyUnicode_4BYTE_DATA(to) + to_start
1469 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001470 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001471 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001472 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1473
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001474 if (!check_maxchar) {
1475 if (from_kind == PyUnicode_2BYTE_KIND
1476 && to_kind == PyUnicode_1BYTE_KIND)
1477 {
1478 _PyUnicode_CONVERT_BYTES(
1479 Py_UCS2, Py_UCS1,
1480 PyUnicode_2BYTE_DATA(from) + from_start,
1481 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1482 PyUnicode_1BYTE_DATA(to) + to_start
1483 );
1484 }
1485 else if (from_kind == PyUnicode_4BYTE_KIND
1486 && to_kind == PyUnicode_1BYTE_KIND)
1487 {
1488 _PyUnicode_CONVERT_BYTES(
1489 Py_UCS4, Py_UCS1,
1490 PyUnicode_4BYTE_DATA(from) + from_start,
1491 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1492 PyUnicode_1BYTE_DATA(to) + to_start
1493 );
1494 }
1495 else if (from_kind == PyUnicode_4BYTE_KIND
1496 && to_kind == PyUnicode_2BYTE_KIND)
1497 {
1498 _PyUnicode_CONVERT_BYTES(
1499 Py_UCS4, Py_UCS2,
1500 PyUnicode_4BYTE_DATA(from) + from_start,
1501 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1502 PyUnicode_2BYTE_DATA(to) + to_start
1503 );
1504 }
1505 else {
1506 assert(0);
1507 return -1;
1508 }
1509 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001510 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001511 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001512 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001513 Py_ssize_t i;
1514
Victor Stinnera0702ab2011-09-29 14:14:38 +02001515 for (i=0; i < how_many; i++) {
1516 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001517 if (ch > to_maxchar)
1518 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001519 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1520 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001521 }
1522 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001523 return 0;
1524}
1525
Victor Stinnerd3f08822012-05-29 12:57:52 +02001526void
1527_PyUnicode_FastCopyCharacters(
1528 PyObject *to, Py_ssize_t to_start,
1529 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001530{
1531 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1532}
1533
1534Py_ssize_t
1535PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1536 PyObject *from, Py_ssize_t from_start,
1537 Py_ssize_t how_many)
1538{
1539 int err;
1540
1541 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1542 PyErr_BadInternalCall();
1543 return -1;
1544 }
1545
Benjamin Petersonbac79492012-01-14 13:34:47 -05001546 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001547 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001548 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001549 return -1;
1550
Victor Stinnerd3f08822012-05-29 12:57:52 +02001551 if (from_start < 0) {
1552 PyErr_SetString(PyExc_IndexError, "string index out of range");
1553 return -1;
1554 }
1555 if (to_start < 0) {
1556 PyErr_SetString(PyExc_IndexError, "string index out of range");
1557 return -1;
1558 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001559 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1560 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1561 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001562 "Cannot write %zi characters at %zi "
1563 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001564 how_many, to_start, PyUnicode_GET_LENGTH(to));
1565 return -1;
1566 }
1567
1568 if (how_many == 0)
1569 return 0;
1570
Victor Stinner488fa492011-12-12 00:01:39 +01001571 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001572 return -1;
1573
1574 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1575 if (err) {
1576 PyErr_Format(PyExc_SystemError,
1577 "Cannot copy %s characters "
1578 "into a string of %s characters",
1579 unicode_kind_name(from),
1580 unicode_kind_name(to));
1581 return -1;
1582 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001583 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001584}
1585
Victor Stinner17222162011-09-28 22:15:37 +02001586/* Find the maximum code point and count the number of surrogate pairs so a
1587 correct string length can be computed before converting a string to UCS4.
1588 This function counts single surrogates as a character and not as a pair.
1589
1590 Return 0 on success, or -1 on error. */
1591static int
1592find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1593 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001594{
1595 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001596 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001597
Victor Stinnerc53be962011-10-02 21:33:54 +02001598 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001599 *num_surrogates = 0;
1600 *maxchar = 0;
1601
1602 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001603#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001604 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1605 && (iter+1) < end
1606 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1607 {
1608 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1609 ++(*num_surrogates);
1610 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001611 }
1612 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001613#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001614 {
1615 ch = *iter;
1616 iter++;
1617 }
1618 if (ch > *maxchar) {
1619 *maxchar = ch;
1620 if (*maxchar > MAX_UNICODE) {
1621 PyErr_Format(PyExc_ValueError,
1622 "character U+%x is not in range [U+0000; U+10ffff]",
1623 ch);
1624 return -1;
1625 }
1626 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001627 }
1628 return 0;
1629}
1630
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001631int
1632_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001633{
1634 wchar_t *end;
1635 Py_UCS4 maxchar = 0;
1636 Py_ssize_t num_surrogates;
1637#if SIZEOF_WCHAR_T == 2
1638 Py_ssize_t length_wo_surrogates;
1639#endif
1640
Georg Brandl7597add2011-10-05 16:36:47 +02001641 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001642 strings were created using _PyObject_New() and where no canonical
1643 representation (the str field) has been set yet aka strings
1644 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001645 assert(_PyUnicode_CHECK(unicode));
1646 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001647 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001648 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001649 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001650 /* Actually, it should neither be interned nor be anything else: */
1651 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001652
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001653 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001654 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001655 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001656 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001657
1658 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001659 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1660 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001661 PyErr_NoMemory();
1662 return -1;
1663 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001664 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001665 _PyUnicode_WSTR(unicode), end,
1666 PyUnicode_1BYTE_DATA(unicode));
1667 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1668 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1669 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1670 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001671 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001672 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001673 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001674 }
1675 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001676 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001677 _PyUnicode_UTF8(unicode) = NULL;
1678 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001679 }
1680 PyObject_FREE(_PyUnicode_WSTR(unicode));
1681 _PyUnicode_WSTR(unicode) = NULL;
1682 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1683 }
1684 /* In this case we might have to convert down from 4-byte native
1685 wchar_t to 2-byte unicode. */
1686 else if (maxchar < 65536) {
1687 assert(num_surrogates == 0 &&
1688 "FindMaxCharAndNumSurrogatePairs() messed up");
1689
Victor Stinner506f5922011-09-28 22:34:18 +02001690#if SIZEOF_WCHAR_T == 2
1691 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001692 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001693 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1694 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1695 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001696 _PyUnicode_UTF8(unicode) = NULL;
1697 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001698#else
1699 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001700 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001701 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001702 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001703 PyErr_NoMemory();
1704 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001705 }
Victor Stinner506f5922011-09-28 22:34:18 +02001706 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1707 _PyUnicode_WSTR(unicode), end,
1708 PyUnicode_2BYTE_DATA(unicode));
1709 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1710 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1711 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001712 _PyUnicode_UTF8(unicode) = NULL;
1713 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001714 PyObject_FREE(_PyUnicode_WSTR(unicode));
1715 _PyUnicode_WSTR(unicode) = NULL;
1716 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1717#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001718 }
1719 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1720 else {
1721#if SIZEOF_WCHAR_T == 2
1722 /* in case the native representation is 2-bytes, we need to allocate a
1723 new normalized 4-byte version. */
1724 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001725 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1726 PyErr_NoMemory();
1727 return -1;
1728 }
Victor Stinnerc3c74152011-10-02 20:39:55 +02001729 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1730 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001731 PyErr_NoMemory();
1732 return -1;
1733 }
1734 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1735 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001736 _PyUnicode_UTF8(unicode) = NULL;
1737 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001738 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1739 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001740 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001741 PyObject_FREE(_PyUnicode_WSTR(unicode));
1742 _PyUnicode_WSTR(unicode) = NULL;
1743 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1744#else
1745 assert(num_surrogates == 0);
1746
Victor Stinnerc3c74152011-10-02 20:39:55 +02001747 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001748 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001749 _PyUnicode_UTF8(unicode) = NULL;
1750 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001751 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1752#endif
1753 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1754 }
1755 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001756 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001757 return 0;
1758}
1759
Alexander Belopolsky40018472011-02-26 01:02:56 +00001760static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001761unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001762{
Walter Dörwald16807132007-05-25 13:52:07 +00001763 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001764 case SSTATE_NOT_INTERNED:
1765 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001766
Benjamin Peterson29060642009-01-31 22:14:21 +00001767 case SSTATE_INTERNED_MORTAL:
1768 /* revive dead object temporarily for DelItem */
1769 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001770 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001771 Py_FatalError(
1772 "deletion of interned string failed");
1773 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001774
Benjamin Peterson29060642009-01-31 22:14:21 +00001775 case SSTATE_INTERNED_IMMORTAL:
1776 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001777
Benjamin Peterson29060642009-01-31 22:14:21 +00001778 default:
1779 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001780 }
1781
Victor Stinner03490912011-10-03 23:45:12 +02001782 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001783 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001784 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001785 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001786 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1787 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001788
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001789 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001790}
1791
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001792#ifdef Py_DEBUG
1793static int
1794unicode_is_singleton(PyObject *unicode)
1795{
1796 PyASCIIObject *ascii = (PyASCIIObject *)unicode;
1797 if (unicode == unicode_empty)
1798 return 1;
1799 if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1)
1800 {
1801 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
1802 if (ch < 256 && unicode_latin1[ch] == unicode)
1803 return 1;
1804 }
1805 return 0;
1806}
1807#endif
1808
Alexander Belopolsky40018472011-02-26 01:02:56 +00001809static int
Victor Stinner488fa492011-12-12 00:01:39 +01001810unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001811{
Victor Stinner488fa492011-12-12 00:01:39 +01001812 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001813 if (Py_REFCNT(unicode) != 1)
1814 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001815 if (_PyUnicode_HASH(unicode) != -1)
1816 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001817 if (PyUnicode_CHECK_INTERNED(unicode))
1818 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001819 if (!PyUnicode_CheckExact(unicode))
1820 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001821#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001822 /* singleton refcount is greater than 1 */
1823 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001824#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001825 return 1;
1826}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001827
Victor Stinnerfe226c02011-10-03 03:52:20 +02001828static int
1829unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1830{
1831 PyObject *unicode;
1832 Py_ssize_t old_length;
1833
1834 assert(p_unicode != NULL);
1835 unicode = *p_unicode;
1836
1837 assert(unicode != NULL);
1838 assert(PyUnicode_Check(unicode));
1839 assert(0 <= length);
1840
Victor Stinner910337b2011-10-03 03:20:16 +02001841 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001842 old_length = PyUnicode_WSTR_LENGTH(unicode);
1843 else
1844 old_length = PyUnicode_GET_LENGTH(unicode);
1845 if (old_length == length)
1846 return 0;
1847
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001848 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001849 _Py_INCREF_UNICODE_EMPTY();
1850 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001851 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001852 Py_SETREF(*p_unicode, unicode_empty);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001853 return 0;
1854 }
1855
Victor Stinner488fa492011-12-12 00:01:39 +01001856 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001857 PyObject *copy = resize_copy(unicode, length);
1858 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001859 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001860 Py_SETREF(*p_unicode, copy);
Benjamin Peterson29060642009-01-31 22:14:21 +00001861 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001862 }
1863
Victor Stinnerfe226c02011-10-03 03:52:20 +02001864 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001865 PyObject *new_unicode = resize_compact(unicode, length);
1866 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001867 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001868 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001869 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001870 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001871 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001872}
1873
Alexander Belopolsky40018472011-02-26 01:02:56 +00001874int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001875PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001876{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001877 PyObject *unicode;
1878 if (p_unicode == NULL) {
1879 PyErr_BadInternalCall();
1880 return -1;
1881 }
1882 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001883 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001884 {
1885 PyErr_BadInternalCall();
1886 return -1;
1887 }
1888 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001889}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001890
Serhiy Storchakad65c9492015-11-02 14:10:23 +02001891/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001892
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001893 WARNING: The function doesn't copy the terminating null character and
1894 doesn't check the maximum character (may write a latin1 character in an
1895 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001896static void
1897unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1898 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001899{
1900 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1901 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001902 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001903
1904 switch (kind) {
1905 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001906 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02001907#ifdef Py_DEBUG
1908 if (PyUnicode_IS_ASCII(unicode)) {
1909 Py_UCS4 maxchar = ucs1lib_find_max_char(
1910 (const Py_UCS1*)str,
1911 (const Py_UCS1*)str + len);
1912 assert(maxchar < 128);
1913 }
1914#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001915 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001916 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001917 }
1918 case PyUnicode_2BYTE_KIND: {
1919 Py_UCS2 *start = (Py_UCS2 *)data + index;
1920 Py_UCS2 *ucs2 = start;
1921 assert(index <= PyUnicode_GET_LENGTH(unicode));
1922
Victor Stinner184252a2012-06-16 02:57:41 +02001923 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001924 *ucs2 = (Py_UCS2)*str;
1925
1926 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001927 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001928 }
1929 default: {
1930 Py_UCS4 *start = (Py_UCS4 *)data + index;
1931 Py_UCS4 *ucs4 = start;
1932 assert(kind == PyUnicode_4BYTE_KIND);
1933 assert(index <= PyUnicode_GET_LENGTH(unicode));
1934
Victor Stinner184252a2012-06-16 02:57:41 +02001935 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001936 *ucs4 = (Py_UCS4)*str;
1937
1938 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001939 }
1940 }
1941}
1942
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001943static PyObject*
1944get_latin1_char(unsigned char ch)
1945{
Victor Stinnera464fc12011-10-02 20:39:30 +02001946 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001947 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001948 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001949 if (!unicode)
1950 return NULL;
1951 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001952 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001953 unicode_latin1[ch] = unicode;
1954 }
1955 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001956 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001957}
1958
Victor Stinner985a82a2014-01-03 12:53:47 +01001959static PyObject*
1960unicode_char(Py_UCS4 ch)
1961{
1962 PyObject *unicode;
1963
1964 assert(ch <= MAX_UNICODE);
1965
Victor Stinnerf3b46b42014-01-03 13:16:00 +01001966 if (ch < 256)
1967 return get_latin1_char(ch);
1968
Victor Stinner985a82a2014-01-03 12:53:47 +01001969 unicode = PyUnicode_New(1, ch);
1970 if (unicode == NULL)
1971 return NULL;
1972 switch (PyUnicode_KIND(unicode)) {
1973 case PyUnicode_1BYTE_KIND:
1974 PyUnicode_1BYTE_DATA(unicode)[0] = (Py_UCS1)ch;
1975 break;
1976 case PyUnicode_2BYTE_KIND:
1977 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
1978 break;
1979 default:
1980 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1981 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
1982 }
1983 assert(_PyUnicode_CheckConsistency(unicode, 1));
1984 return unicode;
1985}
1986
Alexander Belopolsky40018472011-02-26 01:02:56 +00001987PyObject *
1988PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001989{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001990 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001991 Py_UCS4 maxchar = 0;
1992 Py_ssize_t num_surrogates;
1993
1994 if (u == NULL)
1995 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001996
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001997 /* If the Unicode data is known at construction time, we can apply
1998 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001999
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002000 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02002001 if (size == 0)
2002 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00002003
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002004 /* Single character Unicode objects in the Latin-1 range are
2005 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002006 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002007 return get_latin1_char((unsigned char)*u);
2008
2009 /* If not empty and not single character, copy the Unicode data
2010 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02002011 if (find_maxchar_surrogates(u, u + size,
2012 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002013 return NULL;
2014
Victor Stinner8faf8212011-12-08 22:14:11 +01002015 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002016 if (!unicode)
2017 return NULL;
2018
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002019 switch (PyUnicode_KIND(unicode)) {
2020 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002021 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002022 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2023 break;
2024 case PyUnicode_2BYTE_KIND:
2025#if Py_UNICODE_SIZE == 2
2026 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
2027#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002028 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002029 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2030#endif
2031 break;
2032 case PyUnicode_4BYTE_KIND:
2033#if SIZEOF_WCHAR_T == 2
2034 /* This is the only case which has to process surrogates, thus
2035 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02002036 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002037#else
2038 assert(num_surrogates == 0);
2039 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
2040#endif
2041 break;
2042 default:
2043 assert(0 && "Impossible state");
2044 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002045
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002046 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002047}
2048
Alexander Belopolsky40018472011-02-26 01:02:56 +00002049PyObject *
2050PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002051{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002052 if (size < 0) {
2053 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00002054 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00002055 return NULL;
2056 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002057 if (u != NULL)
2058 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2059 else
2060 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002061}
2062
Alexander Belopolsky40018472011-02-26 01:02:56 +00002063PyObject *
2064PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00002065{
2066 size_t size = strlen(u);
2067 if (size > PY_SSIZE_T_MAX) {
2068 PyErr_SetString(PyExc_OverflowError, "input too long");
2069 return NULL;
2070 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002071 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002072}
2073
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002074PyObject *
2075_PyUnicode_FromId(_Py_Identifier *id)
2076{
2077 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01002078 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
2079 strlen(id->string),
2080 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002081 if (!id->object)
2082 return NULL;
2083 PyUnicode_InternInPlace(&id->object);
2084 assert(!id->next);
2085 id->next = static_strings;
2086 static_strings = id;
2087 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002088 return id->object;
2089}
2090
2091void
2092_PyUnicode_ClearStaticStrings()
2093{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002094 _Py_Identifier *tmp, *s = static_strings;
2095 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02002096 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002097 tmp = s->next;
2098 s->next = NULL;
2099 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002100 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002101 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002102}
2103
Benjamin Peterson0df54292012-03-26 14:50:32 -04002104/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002105
Victor Stinnerd3f08822012-05-29 12:57:52 +02002106PyObject*
2107_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02002108{
Victor Stinnerd3f08822012-05-29 12:57:52 +02002109 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01002110 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01002111 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02002112#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002113 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02002114#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002115 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01002116 }
Victor Stinner785938e2011-12-11 20:09:03 +01002117 unicode = PyUnicode_New(size, 127);
2118 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02002119 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01002120 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2121 assert(_PyUnicode_CheckConsistency(unicode, 1));
2122 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02002123}
2124
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002125static Py_UCS4
2126kind_maxchar_limit(unsigned int kind)
2127{
Benjamin Petersonead6b532011-12-20 17:23:42 -06002128 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002129 case PyUnicode_1BYTE_KIND:
2130 return 0x80;
2131 case PyUnicode_2BYTE_KIND:
2132 return 0x100;
2133 case PyUnicode_4BYTE_KIND:
2134 return 0x10000;
2135 default:
2136 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01002137 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002138 }
2139}
2140
Victor Stinnere6abb482012-05-02 01:15:40 +02002141Py_LOCAL_INLINE(Py_UCS4)
2142align_maxchar(Py_UCS4 maxchar)
2143{
2144 if (maxchar <= 127)
2145 return 127;
2146 else if (maxchar <= 255)
2147 return 255;
2148 else if (maxchar <= 65535)
2149 return 65535;
2150 else
2151 return MAX_UNICODE;
2152}
2153
Victor Stinner702c7342011-10-05 13:50:52 +02002154static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002155_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00002156{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002157 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002158 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002159
Serhiy Storchaka678db842013-01-26 12:16:36 +02002160 if (size == 0)
2161 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002162 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002163 if (size == 1)
2164 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002165
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002166 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002167 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002168 if (!res)
2169 return NULL;
2170 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002171 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002172 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00002173}
2174
Victor Stinnere57b1c02011-09-28 22:20:48 +02002175static PyObject*
2176_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002177{
2178 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002179 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002180
Serhiy Storchaka678db842013-01-26 12:16:36 +02002181 if (size == 0)
2182 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002183 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002184 if (size == 1)
2185 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002186
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002187 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002188 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002189 if (!res)
2190 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002191 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002192 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002193 else {
2194 _PyUnicode_CONVERT_BYTES(
2195 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2196 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002197 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002198 return res;
2199}
2200
Victor Stinnere57b1c02011-09-28 22:20:48 +02002201static PyObject*
2202_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002203{
2204 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002205 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002206
Serhiy Storchaka678db842013-01-26 12:16:36 +02002207 if (size == 0)
2208 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002209 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002210 if (size == 1)
2211 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002212
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002213 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002214 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002215 if (!res)
2216 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002217 if (max_char < 256)
2218 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2219 PyUnicode_1BYTE_DATA(res));
2220 else if (max_char < 0x10000)
2221 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2222 PyUnicode_2BYTE_DATA(res));
2223 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002224 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002225 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002226 return res;
2227}
2228
2229PyObject*
2230PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2231{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002232 if (size < 0) {
2233 PyErr_SetString(PyExc_ValueError, "size must be positive");
2234 return NULL;
2235 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002236 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002237 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002238 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002239 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002240 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002241 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002242 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002243 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002244 PyErr_SetString(PyExc_SystemError, "invalid kind");
2245 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002246 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002247}
2248
Victor Stinnerece58de2012-04-23 23:36:38 +02002249Py_UCS4
2250_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2251{
2252 enum PyUnicode_Kind kind;
2253 void *startptr, *endptr;
2254
2255 assert(PyUnicode_IS_READY(unicode));
2256 assert(0 <= start);
2257 assert(end <= PyUnicode_GET_LENGTH(unicode));
2258 assert(start <= end);
2259
2260 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2261 return PyUnicode_MAX_CHAR_VALUE(unicode);
2262
2263 if (start == end)
2264 return 127;
2265
Victor Stinner94d558b2012-04-27 22:26:58 +02002266 if (PyUnicode_IS_ASCII(unicode))
2267 return 127;
2268
Victor Stinnerece58de2012-04-23 23:36:38 +02002269 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002270 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002271 endptr = (char *)startptr + end * kind;
2272 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002273 switch(kind) {
2274 case PyUnicode_1BYTE_KIND:
2275 return ucs1lib_find_max_char(startptr, endptr);
2276 case PyUnicode_2BYTE_KIND:
2277 return ucs2lib_find_max_char(startptr, endptr);
2278 case PyUnicode_4BYTE_KIND:
2279 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002280 default:
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002281 assert(0);
2282 return 0;
Victor Stinnerece58de2012-04-23 23:36:38 +02002283 }
2284}
2285
Victor Stinner25a4b292011-10-06 12:31:55 +02002286/* Ensure that a string uses the most efficient storage, if it is not the
2287 case: create a new string with of the right kind. Write NULL into *p_unicode
2288 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002289static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002290unicode_adjust_maxchar(PyObject **p_unicode)
2291{
2292 PyObject *unicode, *copy;
2293 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002294 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002295 unsigned int kind;
2296
2297 assert(p_unicode != NULL);
2298 unicode = *p_unicode;
2299 assert(PyUnicode_IS_READY(unicode));
2300 if (PyUnicode_IS_ASCII(unicode))
2301 return;
2302
2303 len = PyUnicode_GET_LENGTH(unicode);
2304 kind = PyUnicode_KIND(unicode);
2305 if (kind == PyUnicode_1BYTE_KIND) {
2306 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002307 max_char = ucs1lib_find_max_char(u, u + len);
2308 if (max_char >= 128)
2309 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002310 }
2311 else if (kind == PyUnicode_2BYTE_KIND) {
2312 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002313 max_char = ucs2lib_find_max_char(u, u + len);
2314 if (max_char >= 256)
2315 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002316 }
2317 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002318 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002319 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002320 max_char = ucs4lib_find_max_char(u, u + len);
2321 if (max_char >= 0x10000)
2322 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002323 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002324 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002325 if (copy != NULL)
2326 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002327 Py_DECREF(unicode);
2328 *p_unicode = copy;
2329}
2330
Victor Stinner034f6cf2011-09-30 02:26:44 +02002331PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002332_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002333{
Victor Stinner87af4f22011-11-21 23:03:47 +01002334 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002335 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002336
Victor Stinner034f6cf2011-09-30 02:26:44 +02002337 if (!PyUnicode_Check(unicode)) {
2338 PyErr_BadInternalCall();
2339 return NULL;
2340 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002341 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002342 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002343
Victor Stinner87af4f22011-11-21 23:03:47 +01002344 length = PyUnicode_GET_LENGTH(unicode);
2345 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002346 if (!copy)
2347 return NULL;
2348 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2349
Victor Stinner87af4f22011-11-21 23:03:47 +01002350 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2351 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002352 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002353 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002354}
2355
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002356
Victor Stinnerbc603d12011-10-02 01:00:40 +02002357/* Widen Unicode objects to larger buffers. Don't write terminating null
2358 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002359
2360void*
2361_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2362{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002363 Py_ssize_t len;
2364 void *result;
2365 unsigned int skind;
2366
Benjamin Petersonbac79492012-01-14 13:34:47 -05002367 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002368 return NULL;
2369
2370 len = PyUnicode_GET_LENGTH(s);
2371 skind = PyUnicode_KIND(s);
2372 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002373 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002374 return NULL;
2375 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002376 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002377 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002378 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002379 if (!result)
2380 return PyErr_NoMemory();
2381 assert(skind == PyUnicode_1BYTE_KIND);
2382 _PyUnicode_CONVERT_BYTES(
2383 Py_UCS1, Py_UCS2,
2384 PyUnicode_1BYTE_DATA(s),
2385 PyUnicode_1BYTE_DATA(s) + len,
2386 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002387 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002388 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002389 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002390 if (!result)
2391 return PyErr_NoMemory();
2392 if (skind == PyUnicode_2BYTE_KIND) {
2393 _PyUnicode_CONVERT_BYTES(
2394 Py_UCS2, Py_UCS4,
2395 PyUnicode_2BYTE_DATA(s),
2396 PyUnicode_2BYTE_DATA(s) + len,
2397 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002398 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002399 else {
2400 assert(skind == PyUnicode_1BYTE_KIND);
2401 _PyUnicode_CONVERT_BYTES(
2402 Py_UCS1, Py_UCS4,
2403 PyUnicode_1BYTE_DATA(s),
2404 PyUnicode_1BYTE_DATA(s) + len,
2405 result);
2406 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002407 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002408 default:
2409 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002410 }
Victor Stinner01698042011-10-04 00:04:26 +02002411 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002412 return NULL;
2413}
2414
2415static Py_UCS4*
2416as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2417 int copy_null)
2418{
2419 int kind;
2420 void *data;
2421 Py_ssize_t len, targetlen;
2422 if (PyUnicode_READY(string) == -1)
2423 return NULL;
2424 kind = PyUnicode_KIND(string);
2425 data = PyUnicode_DATA(string);
2426 len = PyUnicode_GET_LENGTH(string);
2427 targetlen = len;
2428 if (copy_null)
2429 targetlen++;
2430 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002431 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002432 if (!target) {
2433 PyErr_NoMemory();
2434 return NULL;
2435 }
2436 }
2437 else {
2438 if (targetsize < targetlen) {
2439 PyErr_Format(PyExc_SystemError,
2440 "string is longer than the buffer");
2441 if (copy_null && 0 < targetsize)
2442 target[0] = 0;
2443 return NULL;
2444 }
2445 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002446 if (kind == PyUnicode_1BYTE_KIND) {
2447 Py_UCS1 *start = (Py_UCS1 *) data;
2448 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002449 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002450 else if (kind == PyUnicode_2BYTE_KIND) {
2451 Py_UCS2 *start = (Py_UCS2 *) data;
2452 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2453 }
2454 else {
2455 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002456 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002457 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002458 if (copy_null)
2459 target[len] = 0;
2460 return target;
2461}
2462
2463Py_UCS4*
2464PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2465 int copy_null)
2466{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002467 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002468 PyErr_BadInternalCall();
2469 return NULL;
2470 }
2471 return as_ucs4(string, target, targetsize, copy_null);
2472}
2473
2474Py_UCS4*
2475PyUnicode_AsUCS4Copy(PyObject *string)
2476{
2477 return as_ucs4(string, NULL, 0, 1);
2478}
2479
2480#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00002481
Alexander Belopolsky40018472011-02-26 01:02:56 +00002482PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02002483PyUnicode_FromWideChar(const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002484{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002485 if (w == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002486 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02002487 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson29060642009-01-31 22:14:21 +00002488 PyErr_BadInternalCall();
2489 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002490 }
2491
Martin v. Löwis790465f2008-04-05 20:41:37 +00002492 if (size == -1) {
2493 size = wcslen(w);
2494 }
2495
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002496 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002497}
2498
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002499#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002500
/* Size of the scratch buffers used when formatting %lld or %p output.
   A long long needs at most ceil(log10(256) * SIZEOF_LONG_LONG) decimal
   digits, and 53/22 is an upper bound for log10(256).  The leading 2
   covers the sign plus one more character (presumably the terminating
   NUL written by sprintf). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG * 53 - 1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002505
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002506static int
2507unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2508 Py_ssize_t width, Py_ssize_t precision)
2509{
2510 Py_ssize_t length, fill, arglen;
2511 Py_UCS4 maxchar;
2512
2513 if (PyUnicode_READY(str) == -1)
2514 return -1;
2515
2516 length = PyUnicode_GET_LENGTH(str);
2517 if ((precision == -1 || precision >= length)
2518 && width <= length)
2519 return _PyUnicodeWriter_WriteStr(writer, str);
2520
2521 if (precision != -1)
2522 length = Py_MIN(precision, length);
2523
2524 arglen = Py_MAX(length, width);
2525 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2526 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2527 else
2528 maxchar = writer->maxchar;
2529
2530 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2531 return -1;
2532
2533 if (width > length) {
2534 fill = width - length;
2535 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2536 return -1;
2537 writer->pos += fill;
2538 }
2539
2540 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2541 str, 0, length);
2542 writer->pos += length;
2543 return 0;
2544}
2545
2546static int
2547unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2548 Py_ssize_t width, Py_ssize_t precision)
2549{
2550 /* UTF-8 */
2551 Py_ssize_t length;
2552 PyObject *unicode;
2553 int res;
2554
2555 length = strlen(str);
2556 if (precision != -1)
2557 length = Py_MIN(length, precision);
2558 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2559 if (unicode == NULL)
2560 return -1;
2561
2562 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2563 Py_DECREF(unicode);
2564 return res;
2565}
2566
Victor Stinner96865452011-03-01 23:44:09 +00002567static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002568unicode_fromformat_arg(_PyUnicodeWriter *writer,
2569 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002570{
Victor Stinnere215d962012-10-06 23:03:36 +02002571 const char *p;
2572 Py_ssize_t len;
2573 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002574 Py_ssize_t width;
2575 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002576 int longflag;
2577 int longlongflag;
2578 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002579 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002580
2581 p = f;
2582 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002583 zeropad = 0;
2584 if (*f == '0') {
2585 zeropad = 1;
2586 f++;
2587 }
Victor Stinner96865452011-03-01 23:44:09 +00002588
2589 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002590 width = -1;
2591 if (Py_ISDIGIT((unsigned)*f)) {
2592 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002593 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002594 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002595 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002596 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002597 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002598 return NULL;
2599 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002600 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002601 f++;
2602 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002603 }
2604 precision = -1;
2605 if (*f == '.') {
2606 f++;
2607 if (Py_ISDIGIT((unsigned)*f)) {
2608 precision = (*f - '0');
2609 f++;
2610 while (Py_ISDIGIT((unsigned)*f)) {
2611 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2612 PyErr_SetString(PyExc_ValueError,
2613 "precision too big");
2614 return NULL;
2615 }
2616 precision = (precision * 10) + (*f - '0');
2617 f++;
2618 }
2619 }
Victor Stinner96865452011-03-01 23:44:09 +00002620 if (*f == '%') {
2621 /* "%.3%s" => f points to "3" */
2622 f--;
2623 }
2624 }
2625 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002626 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002627 f--;
2628 }
Victor Stinner96865452011-03-01 23:44:09 +00002629
2630 /* Handle %ld, %lu, %lld and %llu. */
2631 longflag = 0;
2632 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002633 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002634 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002635 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002636 longflag = 1;
2637 ++f;
2638 }
Victor Stinner96865452011-03-01 23:44:09 +00002639 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002640 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002641 longlongflag = 1;
2642 f += 2;
2643 }
Victor Stinner96865452011-03-01 23:44:09 +00002644 }
2645 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002646 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002647 size_tflag = 1;
2648 ++f;
2649 }
Victor Stinnere215d962012-10-06 23:03:36 +02002650
2651 if (f[1] == '\0')
2652 writer->overallocate = 0;
2653
2654 switch (*f) {
2655 case 'c':
2656 {
2657 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002658 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002659 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002660 "character argument not in range(0x110000)");
2661 return NULL;
2662 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002663 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002664 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002665 break;
2666 }
2667
2668 case 'i':
2669 case 'd':
2670 case 'u':
2671 case 'x':
2672 {
2673 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002674 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002675 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002676
2677 if (*f == 'u') {
Victor Stinnere215d962012-10-06 23:03:36 +02002678 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002679 len = sprintf(buffer, "%lu",
Victor Stinnere215d962012-10-06 23:03:36 +02002680 va_arg(*vargs, unsigned long));
Victor Stinnere215d962012-10-06 23:03:36 +02002681 else if (longlongflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002682 len = sprintf(buffer, "%" PY_FORMAT_LONG_LONG "u",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002683 va_arg(*vargs, unsigned long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002684 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002685 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002686 va_arg(*vargs, size_t));
2687 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002688 len = sprintf(buffer, "%u",
Victor Stinnere215d962012-10-06 23:03:36 +02002689 va_arg(*vargs, unsigned int));
2690 }
2691 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002692 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002693 }
2694 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002695 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002696 len = sprintf(buffer, "%li",
Victor Stinnere215d962012-10-06 23:03:36 +02002697 va_arg(*vargs, long));
Victor Stinnere215d962012-10-06 23:03:36 +02002698 else if (longlongflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002699 len = sprintf(buffer, "%" PY_FORMAT_LONG_LONG "i",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002700 va_arg(*vargs, long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002701 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002702 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002703 va_arg(*vargs, Py_ssize_t));
2704 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002705 len = sprintf(buffer, "%i",
Victor Stinnere215d962012-10-06 23:03:36 +02002706 va_arg(*vargs, int));
2707 }
2708 assert(len >= 0);
2709
Victor Stinnere215d962012-10-06 23:03:36 +02002710 if (precision < len)
2711 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002712
2713 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002714 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2715 return NULL;
2716
Victor Stinnere215d962012-10-06 23:03:36 +02002717 if (width > precision) {
2718 Py_UCS4 fillchar;
2719 fill = width - precision;
2720 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002721 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2722 return NULL;
2723 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002724 }
Victor Stinner15a11362012-10-06 23:48:20 +02002725 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002726 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002727 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2728 return NULL;
2729 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002730 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002731
Victor Stinner4a587072013-11-19 12:54:53 +01002732 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2733 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002734 break;
2735 }
2736
2737 case 'p':
2738 {
2739 char number[MAX_LONG_LONG_CHARS];
2740
2741 len = sprintf(number, "%p", va_arg(*vargs, void*));
2742 assert(len >= 0);
2743
2744 /* %p is ill-defined: ensure leading 0x. */
2745 if (number[1] == 'X')
2746 number[1] = 'x';
2747 else if (number[1] != 'x') {
2748 memmove(number + 2, number,
2749 strlen(number) + 1);
2750 number[0] = '0';
2751 number[1] = 'x';
2752 len += 2;
2753 }
2754
Victor Stinner4a587072013-11-19 12:54:53 +01002755 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002756 return NULL;
2757 break;
2758 }
2759
2760 case 's':
2761 {
2762 /* UTF-8 */
2763 const char *s = va_arg(*vargs, const char*);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002764 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002765 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002766 break;
2767 }
2768
2769 case 'U':
2770 {
2771 PyObject *obj = va_arg(*vargs, PyObject *);
2772 assert(obj && _PyUnicode_CHECK(obj));
2773
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002774 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002775 return NULL;
2776 break;
2777 }
2778
2779 case 'V':
2780 {
2781 PyObject *obj = va_arg(*vargs, PyObject *);
2782 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002783 if (obj) {
2784 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002785 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002786 return NULL;
2787 }
2788 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002789 assert(str != NULL);
2790 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002791 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002792 }
2793 break;
2794 }
2795
2796 case 'S':
2797 {
2798 PyObject *obj = va_arg(*vargs, PyObject *);
2799 PyObject *str;
2800 assert(obj);
2801 str = PyObject_Str(obj);
2802 if (!str)
2803 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002804 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002805 Py_DECREF(str);
2806 return NULL;
2807 }
2808 Py_DECREF(str);
2809 break;
2810 }
2811
2812 case 'R':
2813 {
2814 PyObject *obj = va_arg(*vargs, PyObject *);
2815 PyObject *repr;
2816 assert(obj);
2817 repr = PyObject_Repr(obj);
2818 if (!repr)
2819 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002820 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002821 Py_DECREF(repr);
2822 return NULL;
2823 }
2824 Py_DECREF(repr);
2825 break;
2826 }
2827
2828 case 'A':
2829 {
2830 PyObject *obj = va_arg(*vargs, PyObject *);
2831 PyObject *ascii;
2832 assert(obj);
2833 ascii = PyObject_ASCII(obj);
2834 if (!ascii)
2835 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002836 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002837 Py_DECREF(ascii);
2838 return NULL;
2839 }
2840 Py_DECREF(ascii);
2841 break;
2842 }
2843
2844 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002845 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002846 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002847 break;
2848
2849 default:
2850 /* if we stumble upon an unknown formatting code, copy the rest
2851 of the format string to the output string. (we cannot just
2852 skip the code, since there's no way to know what's in the
2853 argument list) */
2854 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01002855 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002856 return NULL;
2857 f = p+len;
2858 return f;
2859 }
2860
2861 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002862 return f;
2863}
2864
Walter Dörwaldd2034312007-05-18 16:29:38 +00002865PyObject *
2866PyUnicode_FromFormatV(const char *format, va_list vargs)
2867{
Victor Stinnere215d962012-10-06 23:03:36 +02002868 va_list vargs2;
2869 const char *f;
2870 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002871
Victor Stinner8f674cc2013-04-17 23:02:17 +02002872 _PyUnicodeWriter_Init(&writer);
2873 writer.min_length = strlen(format) + 100;
2874 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02002875
2876 /* va_list may be an array (of 1 item) on some platforms (ex: AMD64).
2877 Copy it to be able to pass a reference to a subfunction. */
2878 Py_VA_COPY(vargs2, vargs);
2879
2880 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002881 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002882 f = unicode_fromformat_arg(&writer, f, &vargs2);
2883 if (f == NULL)
2884 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002885 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002886 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002887 const char *p;
2888 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002889
Victor Stinnere215d962012-10-06 23:03:36 +02002890 p = f;
2891 do
2892 {
2893 if ((unsigned char)*p > 127) {
2894 PyErr_Format(PyExc_ValueError,
2895 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2896 "string, got a non-ASCII byte: 0x%02x",
2897 (unsigned char)*p);
2898 return NULL;
2899 }
2900 p++;
2901 }
2902 while (*p != '\0' && *p != '%');
2903 len = p - f;
2904
2905 if (*p == '\0')
2906 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01002907
2908 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002909 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002910
2911 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002912 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002913 }
Victor Stinnere215d962012-10-06 23:03:36 +02002914 return _PyUnicodeWriter_Finish(&writer);
2915
2916 fail:
2917 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002918 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002919}
2920
Walter Dörwaldd2034312007-05-18 16:29:38 +00002921PyObject *
2922PyUnicode_FromFormat(const char *format, ...)
2923{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002924 PyObject* ret;
2925 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002926
2927#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002928 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002929#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002930 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002931#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002932 ret = PyUnicode_FromFormatV(format, vargs);
2933 va_end(vargs);
2934 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002935}
2936
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002937#ifdef HAVE_WCHAR_H
2938
Victor Stinner5593d8a2010-10-02 11:11:27 +00002939/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2940 convert a Unicode object to a wide character string.
2941
Victor Stinnerd88d9832011-09-06 02:00:05 +02002942 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002943 character) required to convert the unicode object. Ignore size argument.
2944
Victor Stinnerd88d9832011-09-06 02:00:05 +02002945 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002946 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002947 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002948static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002949unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002950 wchar_t *w,
2951 Py_ssize_t size)
2952{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002953 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002954 const wchar_t *wstr;
2955
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002956 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002957 if (wstr == NULL)
2958 return -1;
2959
Victor Stinner5593d8a2010-10-02 11:11:27 +00002960 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002961 if (size > res)
2962 size = res + 1;
2963 else
2964 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002965 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002966 return res;
2967 }
2968 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002969 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002970}
2971
2972Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002973PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002974 wchar_t *w,
2975 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002976{
2977 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002978 PyErr_BadInternalCall();
2979 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002980 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002981 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002982}
2983
Victor Stinner137c34c2010-09-29 10:25:54 +00002984wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002985PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002986 Py_ssize_t *size)
2987{
2988 wchar_t* buffer;
2989 Py_ssize_t buflen;
2990
2991 if (unicode == NULL) {
2992 PyErr_BadInternalCall();
2993 return NULL;
2994 }
2995
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002996 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002997 if (buflen == -1)
2998 return NULL;
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002999 buffer = PyMem_NEW(wchar_t, buflen);
Victor Stinner137c34c2010-09-29 10:25:54 +00003000 if (buffer == NULL) {
3001 PyErr_NoMemory();
3002 return NULL;
3003 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003004 buflen = unicode_aswidechar(unicode, buffer, buflen);
Stefan Krah8528c312012-08-19 21:52:43 +02003005 if (buflen == -1) {
3006 PyMem_FREE(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003007 return NULL;
Stefan Krah8528c312012-08-19 21:52:43 +02003008 }
Victor Stinner5593d8a2010-10-02 11:11:27 +00003009 if (size != NULL)
3010 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00003011 return buffer;
3012}
3013
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003014#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003015
Alexander Belopolsky40018472011-02-26 01:02:56 +00003016PyObject *
3017PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003018{
Victor Stinner8faf8212011-12-08 22:14:11 +01003019 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003020 PyErr_SetString(PyExc_ValueError,
3021 "chr() arg not in range(0x110000)");
3022 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003023 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00003024
Victor Stinner985a82a2014-01-03 12:53:47 +01003025 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003026}
3027
Alexander Belopolsky40018472011-02-26 01:02:56 +00003028PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003029PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003030{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003031 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00003032 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003033 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003034 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02003035 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00003036 Py_INCREF(obj);
3037 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003038 }
3039 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003040 /* For a Unicode subtype that's not a Unicode object,
3041 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003042 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003043 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003044 PyErr_Format(PyExc_TypeError,
3045 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00003046 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003047 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003048}
3049
Alexander Belopolsky40018472011-02-26 01:02:56 +00003050PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003051PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003052 const char *encoding,
3053 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003054{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003055 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003056 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003057
Guido van Rossumd57fd912000-03-10 22:53:23 +00003058 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003059 PyErr_BadInternalCall();
3060 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003061 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003062
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003063 /* Decoding bytes objects is the most common case and should be fast */
3064 if (PyBytes_Check(obj)) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003065 if (PyBytes_GET_SIZE(obj) == 0)
3066 _Py_RETURN_UNICODE_EMPTY();
3067 v = PyUnicode_Decode(
3068 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3069 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003070 return v;
3071 }
3072
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003073 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003074 PyErr_SetString(PyExc_TypeError,
3075 "decoding str is not supported");
3076 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003077 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003078
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003079 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3080 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3081 PyErr_Format(PyExc_TypeError,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003082 "decoding to str: need a bytes-like object, %.80s found",
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003083 Py_TYPE(obj)->tp_name);
3084 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003085 }
Tim Petersced69f82003-09-16 20:30:58 +00003086
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003087 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003088 PyBuffer_Release(&buffer);
3089 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003090 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003091
Serhiy Storchaka05997252013-01-26 12:14:02 +02003092 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003093 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003094 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003095}
3096
/* Normalize an encoding name: C implementation of
   encodings.normalize_encoding().

   Alphanumeric characters and '.' are copied to lower in lower case.  Every
   run of other characters is collapsed into a single '_', except that
   leading and trailing runs produce nothing.  The output is always
   null-terminated on success.

   Return 1 on success, or 0 on error (the normalized name does not fit in
   lower_len-1 characters). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *src;
    char *dst = lower;
    /* Last usable slot: reserved for the terminating null. */
    char *dst_end = &lower[lower_len - 1];
    int pending_sep = 0;

    assert(encoding != NULL);

    for (src = encoding; *src != '\0'; src++) {
        char c = *src;

        if (!Py_ISALNUM(c) && c != '.') {
            /* Punctuation: remember it, emit at most one '_' later. */
            pending_sep = 1;
            continue;
        }

        /* Emit the delayed separator, but never at the very beginning. */
        if (pending_sep && dst != lower) {
            if (dst == dst_end) {
                return 0;
            }
            *dst++ = '_';
        }
        pending_sep = 0;

        if (dst == dst_end) {
            return 0;
        }
        *dst++ = Py_TOLOWER(c);
    }

    *dst = '\0';
    return 1;
}
3145
Alexander Belopolsky40018472011-02-26 01:02:56 +00003146PyObject *
3147PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003148 Py_ssize_t size,
3149 const char *encoding,
3150 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003151{
3152 PyObject *buffer = NULL, *unicode;
3153 Py_buffer info;
Victor Stinner942889a2016-09-05 15:40:10 -07003154 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3155
3156 if (encoding == NULL) {
3157 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3158 }
Victor Stinner600d3be2010-06-10 12:00:55 +00003159
Fred Drakee4315f52000-05-09 19:53:39 +00003160 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003161 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3162 char *lower = buflower;
3163
3164 /* Fast paths */
3165 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3166 lower += 3;
3167 if (*lower == '_') {
3168 /* Match "utf8" and "utf_8" */
3169 lower++;
3170 }
3171
3172 if (lower[0] == '8' && lower[1] == 0) {
3173 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3174 }
3175 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3176 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3177 }
3178 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3179 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3180 }
3181 }
3182 else {
3183 if (strcmp(lower, "ascii") == 0
3184 || strcmp(lower, "us_ascii") == 0) {
3185 return PyUnicode_DecodeASCII(s, size, errors);
3186 }
3187 #ifdef HAVE_MBCS
3188 else if (strcmp(lower, "mbcs") == 0) {
3189 return PyUnicode_DecodeMBCS(s, size, errors);
3190 }
3191 #endif
3192 else if (strcmp(lower, "latin1") == 0
3193 || strcmp(lower, "latin_1") == 0
3194 || strcmp(lower, "iso_8859_1") == 0
3195 || strcmp(lower, "iso8859_1") == 0) {
3196 return PyUnicode_DecodeLatin1(s, size, errors);
3197 }
3198 }
Victor Stinner37296e82010-06-10 13:36:23 +00003199 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003200
3201 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003202 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003203 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003204 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003205 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003206 if (buffer == NULL)
3207 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003208 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003209 if (unicode == NULL)
3210 goto onError;
3211 if (!PyUnicode_Check(unicode)) {
3212 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003213 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3214 "use codecs.decode() to decode to arbitrary types",
3215 encoding,
3216 Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003217 Py_DECREF(unicode);
3218 goto onError;
3219 }
3220 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003221 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003222
Benjamin Peterson29060642009-01-31 22:14:21 +00003223 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003224 Py_XDECREF(buffer);
3225 return NULL;
3226}
3227
Alexander Belopolsky40018472011-02-26 01:02:56 +00003228PyObject *
3229PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003230 const char *encoding,
3231 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003232{
3233 PyObject *v;
3234
3235 if (!PyUnicode_Check(unicode)) {
3236 PyErr_BadArgument();
3237 goto onError;
3238 }
3239
3240 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003241 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003242
3243 /* Decode via the codec registry */
3244 v = PyCodec_Decode(unicode, encoding, errors);
3245 if (v == NULL)
3246 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003247 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003248
Benjamin Peterson29060642009-01-31 22:14:21 +00003249 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003250 return NULL;
3251}
3252
Alexander Belopolsky40018472011-02-26 01:02:56 +00003253PyObject *
3254PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003255 const char *encoding,
3256 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003257{
3258 PyObject *v;
3259
3260 if (!PyUnicode_Check(unicode)) {
3261 PyErr_BadArgument();
3262 goto onError;
3263 }
3264
3265 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003266 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003267
3268 /* Decode via the codec registry */
3269 v = PyCodec_Decode(unicode, encoding, errors);
3270 if (v == NULL)
3271 goto onError;
3272 if (!PyUnicode_Check(v)) {
3273 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003274 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3275 "use codecs.decode() to decode to arbitrary types",
3276 encoding,
3277 Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003278 Py_DECREF(v);
3279 goto onError;
3280 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003281 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003282
Benjamin Peterson29060642009-01-31 22:14:21 +00003283 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003284 return NULL;
3285}
3286
Alexander Belopolsky40018472011-02-26 01:02:56 +00003287PyObject *
3288PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003289 Py_ssize_t size,
3290 const char *encoding,
3291 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003292{
3293 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003294
Guido van Rossumd57fd912000-03-10 22:53:23 +00003295 unicode = PyUnicode_FromUnicode(s, size);
3296 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003297 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003298 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3299 Py_DECREF(unicode);
3300 return v;
3301}
3302
Alexander Belopolsky40018472011-02-26 01:02:56 +00003303PyObject *
3304PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003305 const char *encoding,
3306 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003307{
3308 PyObject *v;
3309
3310 if (!PyUnicode_Check(unicode)) {
3311 PyErr_BadArgument();
3312 goto onError;
3313 }
3314
3315 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003316 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003317
3318 /* Encode via the codec registry */
3319 v = PyCodec_Encode(unicode, encoding, errors);
3320 if (v == NULL)
3321 goto onError;
3322 return v;
3323
Benjamin Peterson29060642009-01-31 22:14:21 +00003324 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003325 return NULL;
3326}
3327
/* Locate the first wide character of wstr that wcstombs() cannot encode.

   The string is fed to wcstombs() one character at a time (one surrogate
   pair at a time when wchar_t is 16 bits wide).  Returns the index, in
   wchar_t units, of the first chunk that fails to convert, or 0 if every
   chunk converts. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
    /* Holds one character (or one surrogate pair) plus the terminator. */
#if SIZEOF_WCHAR_T == 2
    wchar_t unit[3];
#else
    wchar_t unit[2];
#endif
    char scratch[MB_LEN_MAX];
    const wchar_t *cursor;

#if SIZEOF_WCHAR_T == 2
    unit[2] = 0;
#else
    unit[1] = 0;
#endif

    for (cursor = wstr; *cursor != L'\0'; ) {
        const wchar_t *unit_start = cursor;

#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cursor[1]))
        {
            /* Keep the pair together so it converts as one character. */
            unit[0] = cursor[0];
            unit[1] = cursor[1];
            cursor += 2;
        }
        else {
            unit[0] = *cursor;
            unit[1] = 0;
            cursor++;
        }
#else
        unit[0] = *cursor;
        cursor++;
#endif
        if (wcstombs(scratch, unit, sizeof(scratch)) == (size_t)-1)
            return unit_start - wstr;
    }

    /* failed to find the unencodable character */
    return 0;
}
3374
Victor Stinner1b579672011-12-17 05:47:23 +01003375static int
3376locale_error_handler(const char *errors, int *surrogateescape)
3377{
Victor Stinner50149202015-09-22 00:26:54 +02003378 _Py_error_handler error_handler = get_error_handler(errors);
3379 switch (error_handler)
3380 {
3381 case _Py_ERROR_STRICT:
Victor Stinner1b579672011-12-17 05:47:23 +01003382 *surrogateescape = 0;
3383 return 0;
Victor Stinner50149202015-09-22 00:26:54 +02003384 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner1b579672011-12-17 05:47:23 +01003385 *surrogateescape = 1;
3386 return 0;
Victor Stinner50149202015-09-22 00:26:54 +02003387 default:
3388 PyErr_Format(PyExc_ValueError,
3389 "only 'strict' and 'surrogateescape' error handlers "
3390 "are supported, not '%s'",
3391 errors);
3392 return -1;
Victor Stinner1b579672011-12-17 05:47:23 +01003393 }
Victor Stinner1b579672011-12-17 05:47:23 +01003394}
3395
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003396PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003397PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003398{
3399 Py_ssize_t wlen, wlen2;
3400 wchar_t *wstr;
3401 PyObject *bytes = NULL;
3402 char *errmsg;
Raymond Hettingere56666d2013-08-04 11:51:03 -07003403 PyObject *reason = NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003404 PyObject *exc;
3405 size_t error_pos;
Victor Stinner1b579672011-12-17 05:47:23 +01003406 int surrogateescape;
3407
3408 if (locale_error_handler(errors, &surrogateescape) < 0)
3409 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003410
3411 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3412 if (wstr == NULL)
3413 return NULL;
3414
3415 wlen2 = wcslen(wstr);
3416 if (wlen2 != wlen) {
3417 PyMem_Free(wstr);
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003418 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003419 return NULL;
3420 }
3421
3422 if (surrogateescape) {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003423 /* "surrogateescape" error handler */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003424 char *str;
3425
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003426 str = Py_EncodeLocale(wstr, &error_pos);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003427 if (str == NULL) {
3428 if (error_pos == (size_t)-1) {
3429 PyErr_NoMemory();
3430 PyMem_Free(wstr);
3431 return NULL;
3432 }
3433 else {
3434 goto encode_error;
3435 }
3436 }
3437 PyMem_Free(wstr);
3438
3439 bytes = PyBytes_FromString(str);
3440 PyMem_Free(str);
3441 }
3442 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003443 /* strict mode */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003444 size_t len, len2;
3445
3446 len = wcstombs(NULL, wstr, 0);
3447 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003448 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003449 goto encode_error;
3450 }
3451
3452 bytes = PyBytes_FromStringAndSize(NULL, len);
3453 if (bytes == NULL) {
3454 PyMem_Free(wstr);
3455 return NULL;
3456 }
3457
3458 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3459 if (len2 == (size_t)-1 || len2 > len) {
Victor Stinner2f197072011-12-17 07:08:30 +01003460 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003461 goto encode_error;
3462 }
3463 PyMem_Free(wstr);
3464 }
3465 return bytes;
3466
3467encode_error:
3468 errmsg = strerror(errno);
3469 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003470
3471 if (error_pos == (size_t)-1)
3472 error_pos = wcstombs_errorpos(wstr);
3473
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003474 PyMem_Free(wstr);
3475 Py_XDECREF(bytes);
3476
Victor Stinner2f197072011-12-17 07:08:30 +01003477 if (errmsg != NULL) {
3478 size_t errlen;
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003479 wstr = Py_DecodeLocale(errmsg, &errlen);
Victor Stinner2f197072011-12-17 07:08:30 +01003480 if (wstr != NULL) {
3481 reason = PyUnicode_FromWideChar(wstr, errlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003482 PyMem_RawFree(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003483 } else
3484 errmsg = NULL;
3485 }
3486 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003487 reason = PyUnicode_FromString(
3488 "wcstombs() encountered an unencodable "
3489 "wide character");
3490 if (reason == NULL)
3491 return NULL;
3492
3493 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3494 "locale", unicode,
3495 (Py_ssize_t)error_pos,
3496 (Py_ssize_t)(error_pos+1),
3497 reason);
3498 Py_DECREF(reason);
3499 if (exc != NULL) {
3500 PyCodec_StrictErrors(exc);
3501 Py_XDECREF(exc);
3502 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003503 return NULL;
3504}
3505
Victor Stinnerad158722010-10-27 00:25:46 +00003506PyObject *
3507PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003508{
Victor Stinner99b95382011-07-04 14:23:54 +02003509#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003510 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003511#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003512 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003513#else
Victor Stinner793b5312011-04-27 00:24:21 +02003514 PyInterpreterState *interp = PyThreadState_GET()->interp;
3515 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3516 cannot use it to encode and decode filenames before it is loaded. Load
3517 the Python codec requires to encode at least its own filename. Use the C
3518 version of the locale codec until the codec registry is initialized and
3519 the Python codec is loaded.
3520
3521 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3522 cannot only rely on it: check also interp->fscodec_initialized for
3523 subinterpreters. */
3524 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003525 return PyUnicode_AsEncodedString(unicode,
3526 Py_FileSystemDefaultEncoding,
3527 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003528 }
3529 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003530 return PyUnicode_EncodeLocale(unicode, "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003531 }
Victor Stinnerad158722010-10-27 00:25:46 +00003532#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003533}
3534
Alexander Belopolsky40018472011-02-26 01:02:56 +00003535PyObject *
3536PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003537 const char *encoding,
3538 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003539{
3540 PyObject *v;
Victor Stinner942889a2016-09-05 15:40:10 -07003541 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003542
Guido van Rossumd57fd912000-03-10 22:53:23 +00003543 if (!PyUnicode_Check(unicode)) {
3544 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003545 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003546 }
Fred Drakee4315f52000-05-09 19:53:39 +00003547
Victor Stinner942889a2016-09-05 15:40:10 -07003548 if (encoding == NULL) {
3549 return _PyUnicode_AsUTF8String(unicode, errors);
3550 }
3551
Fred Drakee4315f52000-05-09 19:53:39 +00003552 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003553 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3554 char *lower = buflower;
3555
3556 /* Fast paths */
3557 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3558 lower += 3;
3559 if (*lower == '_') {
3560 /* Match "utf8" and "utf_8" */
3561 lower++;
3562 }
3563
3564 if (lower[0] == '8' && lower[1] == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003565 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner942889a2016-09-05 15:40:10 -07003566 }
3567 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3568 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3569 }
3570 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3571 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3572 }
Victor Stinnera5c68c32011-03-02 01:03:14 +00003573 }
Victor Stinner942889a2016-09-05 15:40:10 -07003574 else {
3575 if (strcmp(lower, "ascii") == 0
3576 || strcmp(lower, "us_ascii") == 0) {
3577 return _PyUnicode_AsASCIIString(unicode, errors);
3578 }
Victor Stinner99b95382011-07-04 14:23:54 +02003579#ifdef HAVE_MBCS
Victor Stinner942889a2016-09-05 15:40:10 -07003580 else if (strcmp(lower, "mbcs") == 0) {
3581 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3582 }
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003583#endif
Victor Stinner942889a2016-09-05 15:40:10 -07003584 else if (strcmp(lower, "latin1") == 0 ||
3585 strcmp(lower, "latin_1") == 0 ||
3586 strcmp(lower, "iso_8859_1") == 0 ||
3587 strcmp(lower, "iso8859_1") == 0) {
3588 return _PyUnicode_AsLatin1String(unicode, errors);
3589 }
3590 }
Victor Stinner37296e82010-06-10 13:36:23 +00003591 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003592
3593 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003594 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003595 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003596 return NULL;
3597
3598 /* The normal path */
3599 if (PyBytes_Check(v))
3600 return v;
3601
3602 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003603 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003604 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003605 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003606
3607 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003608 "encoder %s returned bytearray instead of bytes; "
3609 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003610 encoding);
3611 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003612 Py_DECREF(v);
3613 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003614 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003615
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003616 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3617 Py_DECREF(v);
3618 return b;
3619 }
3620
3621 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003622 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
3623 "use codecs.encode() to encode to arbitrary types",
3624 encoding,
3625 Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003626 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003627 return NULL;
3628}
3629
Alexander Belopolsky40018472011-02-26 01:02:56 +00003630PyObject *
3631PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003632 const char *encoding,
3633 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003634{
3635 PyObject *v;
3636
3637 if (!PyUnicode_Check(unicode)) {
3638 PyErr_BadArgument();
3639 goto onError;
3640 }
3641
3642 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003643 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003644
3645 /* Encode via the codec registry */
3646 v = PyCodec_Encode(unicode, encoding, errors);
3647 if (v == NULL)
3648 goto onError;
3649 if (!PyUnicode_Check(v)) {
3650 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003651 "'%.400s' encoder returned '%.400s' instead of 'str'; "
3652 "use codecs.encode() to encode to arbitrary types",
3653 encoding,
3654 Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003655 Py_DECREF(v);
3656 goto onError;
3657 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003658 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003659
Benjamin Peterson29060642009-01-31 22:14:21 +00003660 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003661 return NULL;
3662}
3663
/* Locate the first byte sequence in str[0:len] that mbrtowc() cannot
   convert and return its byte offset.

   Returns 0 when no such sequence is found, when a null character is
   reached first, or when mbrtowc() is not available (HAVE_MBRTOWC
   undefined). */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    mbstate_t state;
    size_t offset = 0;

    memset(&state, 0, sizeof state);
    while (offset < len) {
        wchar_t wc;
        size_t consumed = mbrtowc(&wc, str + offset, len - offset, &state);

        if (consumed == 0) {
            /* Reached end of string */
            break;
        }
        if (consumed == (size_t)-1 || consumed == (size_t)-2) {
            /* Conversion error or incomplete character */
            return offset;
        }
        offset += consumed;
    }
    /* failed to find the undecodable byte sequence */
    return 0;
#else
    return 0;
#endif
}
3694
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003695PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003696PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003697 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003698{
3699 wchar_t smallbuf[256];
3700 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3701 wchar_t *wstr;
3702 size_t wlen, wlen2;
3703 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003704 int surrogateescape;
Victor Stinner2f197072011-12-17 07:08:30 +01003705 size_t error_pos;
3706 char *errmsg;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01003707 PyObject *reason = NULL; /* initialize to prevent gcc warning */
3708 PyObject *exc;
Victor Stinner1b579672011-12-17 05:47:23 +01003709
3710 if (locale_error_handler(errors, &surrogateescape) < 0)
3711 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003712
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003713 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3714 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003715 return NULL;
3716 }
3717
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003718 if (surrogateescape) {
3719 /* "surrogateescape" error handler */
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003720 wstr = Py_DecodeLocale(str, &wlen);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003721 if (wstr == NULL) {
3722 if (wlen == (size_t)-1)
3723 PyErr_NoMemory();
3724 else
3725 PyErr_SetFromErrno(PyExc_OSError);
3726 return NULL;
3727 }
3728
3729 unicode = PyUnicode_FromWideChar(wstr, wlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003730 PyMem_RawFree(wstr);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003731 }
3732 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003733 /* strict mode */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003734#ifndef HAVE_BROKEN_MBSTOWCS
3735 wlen = mbstowcs(NULL, str, 0);
3736#else
3737 wlen = len;
3738#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003739 if (wlen == (size_t)-1)
3740 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003741 if (wlen+1 <= smallbuf_len) {
3742 wstr = smallbuf;
3743 }
3744 else {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02003745 wstr = PyMem_New(wchar_t, wlen+1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003746 if (!wstr)
3747 return PyErr_NoMemory();
3748 }
3749
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003750 wlen2 = mbstowcs(wstr, str, wlen+1);
3751 if (wlen2 == (size_t)-1) {
3752 if (wstr != smallbuf)
3753 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003754 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003755 }
3756#ifdef HAVE_BROKEN_MBSTOWCS
3757 assert(wlen2 == wlen);
3758#endif
3759 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3760 if (wstr != smallbuf)
3761 PyMem_Free(wstr);
3762 }
3763 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003764
3765decode_error:
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +02003766 reason = NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01003767 errmsg = strerror(errno);
3768 assert(errmsg != NULL);
3769
3770 error_pos = mbstowcs_errorpos(str, len);
3771 if (errmsg != NULL) {
3772 size_t errlen;
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003773 wstr = Py_DecodeLocale(errmsg, &errlen);
Victor Stinner2f197072011-12-17 07:08:30 +01003774 if (wstr != NULL) {
3775 reason = PyUnicode_FromWideChar(wstr, errlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003776 PyMem_RawFree(wstr);
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +02003777 }
Victor Stinner2f197072011-12-17 07:08:30 +01003778 }
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +02003779 if (reason == NULL)
Victor Stinner2f197072011-12-17 07:08:30 +01003780 reason = PyUnicode_FromString(
3781 "mbstowcs() encountered an invalid multibyte sequence");
3782 if (reason == NULL)
3783 return NULL;
3784
3785 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3786 "locale", str, len,
3787 (Py_ssize_t)error_pos,
3788 (Py_ssize_t)(error_pos+1),
3789 reason);
3790 Py_DECREF(reason);
3791 if (exc != NULL) {
3792 PyCodec_StrictErrors(exc);
3793 Py_XDECREF(exc);
3794 }
3795 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003796}
3797
3798PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003799PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003800{
3801 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003802 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003803}
3804
3805
3806PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003807PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003808 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003809 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3810}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003811
Christian Heimes5894ba72007-11-04 11:43:14 +00003812PyObject*
3813PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3814{
Victor Stinner99b95382011-07-04 14:23:54 +02003815#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003816 return PyUnicode_DecodeMBCS(s, size, NULL);
3817#elif defined(__APPLE__)
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003818 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003819#else
Victor Stinner793b5312011-04-27 00:24:21 +02003820 PyInterpreterState *interp = PyThreadState_GET()->interp;
3821 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3822 cannot use it to encode and decode filenames before it is loaded. Load
3823 the Python codec requires to encode at least its own filename. Use the C
3824 version of the locale codec until the codec registry is initialized and
3825 the Python codec is loaded.
3826
3827 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3828 cannot only rely on it: check also interp->fscodec_initialized for
3829 subinterpreters. */
3830 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003831 return PyUnicode_Decode(s, size,
3832 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003833 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003834 }
3835 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003836 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003837 }
Victor Stinnerad158722010-10-27 00:25:46 +00003838#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003839}
3840
Martin v. Löwis011e8422009-05-05 04:43:17 +00003841
3842int
3843PyUnicode_FSConverter(PyObject* arg, void* addr)
3844{
3845 PyObject *output = NULL;
3846 Py_ssize_t size;
3847 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003848 if (arg == NULL) {
3849 Py_DECREF(*(PyObject**)addr);
Benjamin Petersona4d33b32015-11-15 21:57:39 -08003850 *(PyObject**)addr = NULL;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003851 return 1;
3852 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003853 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003854 output = arg;
3855 Py_INCREF(output);
3856 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003857 else if (PyUnicode_Check(arg)) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003858 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003859 if (!output)
3860 return 0;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003861 assert(PyBytes_Check(output));
3862 }
3863 else {
3864 PyErr_Format(PyExc_TypeError,
3865 "must be str or bytes, not %.100s",
3866 Py_TYPE(arg)->tp_name);
3867 return 0;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003868 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003869 size = PyBytes_GET_SIZE(output);
3870 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02003871 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003872 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003873 Py_DECREF(output);
3874 return 0;
3875 }
3876 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003877 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003878}
3879
3880
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003881int
3882PyUnicode_FSDecoder(PyObject* arg, void* addr)
3883{
3884 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003885 if (arg == NULL) {
3886 Py_DECREF(*(PyObject**)addr);
3887 return 1;
3888 }
3889 if (PyUnicode_Check(arg)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003890 if (PyUnicode_READY(arg) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003891 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003892 output = arg;
3893 Py_INCREF(output);
3894 }
Serhiy Storchakafebc3322016-08-06 23:29:29 +03003895 else if (PyBytes_Check(arg) || PyObject_CheckBuffer(arg)) {
3896 if (!PyBytes_Check(arg) &&
3897 PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
3898 "path should be string or bytes, not %.200s",
3899 Py_TYPE(arg)->tp_name)) {
3900 return 0;
3901 }
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003902 arg = PyBytes_FromObject(arg);
3903 if (!arg)
3904 return 0;
3905 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3906 PyBytes_GET_SIZE(arg));
3907 Py_DECREF(arg);
3908 if (!output)
3909 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003910 }
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003911 else {
3912 PyErr_Format(PyExc_TypeError,
3913 "path should be string or bytes, not %.200s",
3914 Py_TYPE(arg)->tp_name);
3915 return 0;
3916 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003917 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003918 Py_DECREF(output);
3919 return 0;
3920 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003921 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003922 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003923 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003924 Py_DECREF(output);
3925 return 0;
3926 }
3927 *(PyObject**)addr = output;
3928 return Py_CLEANUP_SUPPORTED;
3929}
3930
3931
Martin v. Löwis5b222132007-06-10 09:51:05 +00003932char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003933PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003934{
Christian Heimesf3863112007-11-22 07:46:41 +00003935 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003936
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003937 if (!PyUnicode_Check(unicode)) {
3938 PyErr_BadArgument();
3939 return NULL;
3940 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003941 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003942 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003943
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003944 if (PyUnicode_UTF8(unicode) == NULL) {
3945 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003946 bytes = _PyUnicode_AsUTF8String(unicode, NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003947 if (bytes == NULL)
3948 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003949 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3950 if (_PyUnicode_UTF8(unicode) == NULL) {
Victor Stinnera5afb582013-10-29 01:28:23 +01003951 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003952 Py_DECREF(bytes);
3953 return NULL;
3954 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003955 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3956 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3957 PyBytes_AS_STRING(bytes),
3958 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003959 Py_DECREF(bytes);
3960 }
3961
3962 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003963 *psize = PyUnicode_UTF8_LENGTH(unicode);
3964 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003965}
3966
3967char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003968PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003969{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003970 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3971}
3972
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003973Py_UNICODE *
3974PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3975{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003976 const unsigned char *one_byte;
3977#if SIZEOF_WCHAR_T == 4
3978 const Py_UCS2 *two_bytes;
3979#else
3980 const Py_UCS4 *four_bytes;
3981 const Py_UCS4 *ucs4_end;
3982 Py_ssize_t num_surrogates;
3983#endif
3984 wchar_t *w;
3985 wchar_t *wchar_end;
3986
3987 if (!PyUnicode_Check(unicode)) {
3988 PyErr_BadArgument();
3989 return NULL;
3990 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003991 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003992 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003993 assert(_PyUnicode_KIND(unicode) != 0);
3994 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003995
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003996 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003997#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003998 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3999 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004000 num_surrogates = 0;
4001
4002 for (; four_bytes < ucs4_end; ++four_bytes) {
4003 if (*four_bytes > 0xFFFF)
4004 ++num_surrogates;
4005 }
4006
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004007 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
4008 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
4009 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004010 PyErr_NoMemory();
4011 return NULL;
4012 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004013 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004014
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004015 w = _PyUnicode_WSTR(unicode);
4016 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
4017 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004018 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
4019 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004020 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004021 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01004022 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
4023 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004024 }
4025 else
4026 *w = *four_bytes;
4027
4028 if (w > wchar_end) {
4029 assert(0 && "Miscalculated string end");
4030 }
4031 }
4032 *w = 0;
4033#else
4034 /* sizeof(wchar_t) == 4 */
4035 Py_FatalError("Impossible unicode object state, wstr and str "
4036 "should share memory already.");
4037 return NULL;
4038#endif
4039 }
4040 else {
Serhiy Storchakae55181f2015-02-20 21:34:06 +02004041 if ((size_t)_PyUnicode_LENGTH(unicode) >
4042 PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
4043 PyErr_NoMemory();
4044 return NULL;
4045 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004046 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
4047 (_PyUnicode_LENGTH(unicode) + 1));
4048 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004049 PyErr_NoMemory();
4050 return NULL;
4051 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004052 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
4053 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
4054 w = _PyUnicode_WSTR(unicode);
4055 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004056
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004057 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
4058 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004059 for (; w < wchar_end; ++one_byte, ++w)
4060 *w = *one_byte;
4061 /* null-terminate the wstr */
4062 *w = 0;
4063 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004064 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004065#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004066 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004067 for (; w < wchar_end; ++two_bytes, ++w)
4068 *w = *two_bytes;
4069 /* null-terminate the wstr */
4070 *w = 0;
4071#else
4072 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004073 PyObject_FREE(_PyUnicode_WSTR(unicode));
4074 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004075 Py_FatalError("Impossible unicode object state, wstr "
4076 "and str should share memory already.");
4077 return NULL;
4078#endif
4079 }
4080 else {
4081 assert(0 && "This should never happen.");
4082 }
4083 }
4084 }
4085 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004086 *size = PyUnicode_WSTR_LENGTH(unicode);
4087 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00004088}
4089
Alexander Belopolsky40018472011-02-26 01:02:56 +00004090Py_UNICODE *
4091PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004092{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004093 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004094}
4095
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004096
Alexander Belopolsky40018472011-02-26 01:02:56 +00004097Py_ssize_t
4098PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004099{
4100 if (!PyUnicode_Check(unicode)) {
4101 PyErr_BadArgument();
4102 goto onError;
4103 }
4104 return PyUnicode_GET_SIZE(unicode);
4105
Benjamin Peterson29060642009-01-31 22:14:21 +00004106 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004107 return -1;
4108}
4109
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004110Py_ssize_t
4111PyUnicode_GetLength(PyObject *unicode)
4112{
Victor Stinner07621332012-06-16 04:53:46 +02004113 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004114 PyErr_BadArgument();
4115 return -1;
4116 }
Victor Stinner07621332012-06-16 04:53:46 +02004117 if (PyUnicode_READY(unicode) == -1)
4118 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004119 return PyUnicode_GET_LENGTH(unicode);
4120}
4121
4122Py_UCS4
4123PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4124{
Victor Stinner69ed0f42013-04-09 21:48:24 +02004125 void *data;
4126 int kind;
4127
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004128 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
4129 PyErr_BadArgument();
4130 return (Py_UCS4)-1;
4131 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004132 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004133 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004134 return (Py_UCS4)-1;
4135 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02004136 data = PyUnicode_DATA(unicode);
4137 kind = PyUnicode_KIND(unicode);
4138 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004139}
4140
4141int
4142PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4143{
4144 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004145 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004146 return -1;
4147 }
Victor Stinner488fa492011-12-12 00:01:39 +01004148 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004149 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004150 PyErr_SetString(PyExc_IndexError, "string index out of range");
4151 return -1;
4152 }
Victor Stinner488fa492011-12-12 00:01:39 +01004153 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004154 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004155 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4156 PyErr_SetString(PyExc_ValueError, "character out of range");
4157 return -1;
4158 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004159 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4160 index, ch);
4161 return 0;
4162}
4163
/* Return the name of the default encoding, which is always "utf-8".
   The returned string has static storage and must not be freed. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4169
Victor Stinner554f3f02010-06-16 23:33:54 +00004170/* create or adjust a UnicodeDecodeError */
4171static void
4172make_decode_exception(PyObject **exceptionObject,
4173 const char *encoding,
4174 const char *input, Py_ssize_t length,
4175 Py_ssize_t startpos, Py_ssize_t endpos,
4176 const char *reason)
4177{
4178 if (*exceptionObject == NULL) {
4179 *exceptionObject = PyUnicodeDecodeError_Create(
4180 encoding, input, length, startpos, endpos, reason);
4181 }
4182 else {
4183 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4184 goto onError;
4185 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4186 goto onError;
4187 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4188 goto onError;
4189 }
4190 return;
4191
4192onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004193 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004194}
4195
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004196#ifdef HAVE_MBCS
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004197/* error handling callback helper:
4198 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00004199 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004200 and adjust various state variables.
4201 return 0 on success, -1 on error
4202*/
4203
Alexander Belopolsky40018472011-02-26 01:02:56 +00004204static int
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004205unicode_decode_call_errorhandler_wchar(
4206 const char *errors, PyObject **errorHandler,
4207 const char *encoding, const char *reason,
4208 const char **input, const char **inend, Py_ssize_t *startinpos,
4209 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4210 PyObject **output, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004211{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02004212 static const char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004213
4214 PyObject *restuple = NULL;
4215 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01004216 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004217 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004218 Py_ssize_t requiredsize;
4219 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004220 PyObject *inputobj = NULL;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004221 wchar_t *repwstr;
4222 Py_ssize_t repwlen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004223
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004224 assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
4225 outsize = _PyUnicode_WSTR_LENGTH(*output);
Victor Stinner596a6c42011-11-09 00:02:18 +01004226
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004227 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004228 *errorHandler = PyCodec_LookupError(errors);
4229 if (*errorHandler == NULL)
4230 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004231 }
4232
Victor Stinner554f3f02010-06-16 23:33:54 +00004233 make_decode_exception(exceptionObject,
4234 encoding,
4235 *input, *inend - *input,
4236 *startinpos, *endinpos,
4237 reason);
4238 if (*exceptionObject == NULL)
4239 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004240
4241 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4242 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004243 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004244 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00004245 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004246 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004247 }
4248 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004249 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004250
4251 /* Copy back the bytes variables, which might have been modified by the
4252 callback */
4253 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4254 if (!inputobj)
4255 goto onError;
4256 if (!PyBytes_Check(inputobj)) {
4257 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
4258 }
4259 *input = PyBytes_AS_STRING(inputobj);
4260 insize = PyBytes_GET_SIZE(inputobj);
4261 *inend = *input + insize;
4262 /* we can DECREF safely, as the exception has another reference,
4263 so the object won't go away. */
4264 Py_DECREF(inputobj);
4265
4266 if (newpos<0)
4267 newpos = insize+newpos;
4268 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004269 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004270 goto onError;
4271 }
4272
4273 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4274 if (repwstr == NULL)
4275 goto onError;
4276 /* need more space? (at least enough for what we
4277 have+the replacement+the rest of the string (starting
4278 at the new input position), so we won't have to check space
4279 when there are no errors in the rest of the string) */
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004280 requiredsize = *outpos;
4281 if (requiredsize > PY_SSIZE_T_MAX - repwlen)
4282 goto overflow;
4283 requiredsize += repwlen;
4284 if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4285 goto overflow;
4286 requiredsize += insize - newpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004287 if (requiredsize > outsize) {
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004288 if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004289 requiredsize = 2*outsize;
4290 if (unicode_resize(output, requiredsize) < 0)
4291 goto onError;
4292 }
4293 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
4294 *outpos += repwlen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004295 *endinpos = newpos;
4296 *inptr = *input + newpos;
4297
4298 /* we made it! */
4299 Py_XDECREF(restuple);
4300 return 0;
4301
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004302 overflow:
4303 PyErr_SetString(PyExc_OverflowError,
4304 "decoded result is too long for a Python string");
4305
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004306 onError:
4307 Py_XDECREF(restuple);
4308 return -1;
4309}
4310#endif /* HAVE_MBCS */
4311
4312static int
4313unicode_decode_call_errorhandler_writer(
4314 const char *errors, PyObject **errorHandler,
4315 const char *encoding, const char *reason,
4316 const char **input, const char **inend, Py_ssize_t *startinpos,
4317 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4318 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4319{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02004320 static const char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004321
4322 PyObject *restuple = NULL;
4323 PyObject *repunicode = NULL;
4324 Py_ssize_t insize;
4325 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004326 Py_ssize_t replen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004327 PyObject *inputobj = NULL;
4328
4329 if (*errorHandler == NULL) {
4330 *errorHandler = PyCodec_LookupError(errors);
4331 if (*errorHandler == NULL)
4332 goto onError;
4333 }
4334
4335 make_decode_exception(exceptionObject,
4336 encoding,
4337 *input, *inend - *input,
4338 *startinpos, *endinpos,
4339 reason);
4340 if (*exceptionObject == NULL)
4341 goto onError;
4342
4343 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4344 if (restuple == NULL)
4345 goto onError;
4346 if (!PyTuple_Check(restuple)) {
4347 PyErr_SetString(PyExc_TypeError, &argparse[4]);
4348 goto onError;
4349 }
4350 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004351 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004352
4353 /* Copy back the bytes variables, which might have been modified by the
4354 callback */
4355 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4356 if (!inputobj)
4357 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00004358 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004359 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00004360 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004361 *input = PyBytes_AS_STRING(inputobj);
4362 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004363 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004364 /* we can DECREF safely, as the exception has another reference,
4365 so the object won't go away. */
4366 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004367
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004368 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004369 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004370 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004371 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004372 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004373 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004374
Victor Stinner8f674cc2013-04-17 23:02:17 +02004375 if (PyUnicode_READY(repunicode) < 0)
4376 goto onError;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004377 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004378 if (replen > 1) {
4379 writer->min_length += replen - 1;
Victor Stinner8f674cc2013-04-17 23:02:17 +02004380 writer->overallocate = 1;
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004381 if (_PyUnicodeWriter_Prepare(writer, writer->min_length,
4382 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4383 goto onError;
4384 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004385 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004386 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004387
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004388 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004389 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004390
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004391 /* we made it! */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004392 Py_XDECREF(restuple);
4393 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004394
Benjamin Peterson29060642009-01-31 22:14:21 +00004395 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004396 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004397 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004398}
4399
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004400/* --- UTF-7 Codec -------------------------------------------------------- */
4401
Antoine Pitrou244651a2009-05-04 18:56:13 +00004402/* See RFC2152 for details. We encode conservatively and decode liberally. */
4403
4404/* Three simple macros defining base-64. */
4405
/* True (1) when c is one of the 64 base-64 alphabet characters
   A-Z, a-z, 0-9, '+' or '/'; false (0) otherwise.
   Note: c is evaluated more than once. */

#define IS_BASE64(c)                        \
    (('A' <= (c) && (c) <= 'Z') ||          \
     ('a' <= (c) && (c) <= 'z') ||          \
     ('0' <= (c) && (c) <= '9') ||          \
     '+' == (c) || '/' == (c))
4413
/* Map a base-64 character c to its 6-bit value:
   'A'-'Z' -> 0-25, 'a'-'z' -> 26-51, '0'-'9' -> 52-61, '+' -> 62 and
   anything else (i.e. '/', given that c is a base-64 character) -> 63.
   Note: c is evaluated more than once. */

#define FROM_BASE64(c)                                      \
    ((c) == '+' ? 62 :                                      \
     ('0' <= (c) && (c) <= '9') ? (c) - '0' + 52 :          \
     ('a' <= (c) && (c) <= 'z') ? (c) - 'a' + 26 :          \
     ('A' <= (c) && (c) <= 'Z') ? (c) - 'A' : 63)
4421
/* Base-64 character for the low 6 bits of n (higher bits are masked off). */

#define TO_BASE64(n)                        \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"           \
     "abcdefghijklmnopqrstuvwxyz"           \
     "0123456789+/"[(n) & 0x3f])
4426
/* DECODE_DIRECT: does this byte, met outside a shift sequence in a UTF-7
 * string, decode as itself?  Decoding is permissive: every byte in the
 * range 0..127 qualifies except '+', which opens a base-64 section.
 * Note: c is evaluated more than once. */

#define DECODE_DIRECT(c) \
    (!((c) == '+' || (c) > 127))
4434
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code (0..127) and is only ever read,
 * hence it is declared const.
 */

static const
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
4468
/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 *   c         character code; only 1..127 can ever be direct
 *   directO   non-zero: "Set O" characters (category 1) are direct
 *   directWS  non-zero: whitespace characters (category 2) are direct
 *
 * "Set D" characters (category 0) are always direct.  The flag arguments
 * are parenthesised in the expansion so that compound expressions such as
 * `a || b` keep their meaning.  Note: c is evaluated more than once. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004480
Alexander Belopolsky40018472011-02-26 01:02:56 +00004481PyObject *
4482PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004483 Py_ssize_t size,
4484 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004485{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004486 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4487}
4488
/* The decoder.  The only state we preserve is our read position,
 * i.e. how many characters we have consumed.  So if we end in the
 * middle of a shift sequence we have to back off the read position
 * and the output to the beginning of the sequence, otherwise we lose
 * all the shift state (seen bits, number of bits seen, high
 * surrogate).
 *
 * Parameters:
 *   s, size   the UTF-7 bytes to decode
 *   errors    error handler name, forwarded to
 *             unicode_decode_call_errorhandler_writer()
 *   consumed  if non-NULL, receives the number of input bytes consumed;
 *             when the input ends inside a shift sequence this is the
 *             offset of the '+' that opened it, and the output produced
 *             since that '+' is dropped.  If NULL, an inconsistent state
 *             at the end of the input is reported through the error
 *             handler instead.
 *
 * Returns the decoded string object, or NULL on error. */

PyObject *
PyUnicode_DecodeUTF7Stateful(const char *s,
                             Py_ssize_t size,
                             const char *errors,
                             Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    const char *e;
    _PyUnicodeWriter writer;
    const char *errmsg = "";
    int inShift = 0;                 /* non-zero while inside a base-64 section */
    Py_ssize_t shiftOutStart;        /* writer.pos when the current section began */
    unsigned int base64bits = 0;     /* number of pending bits in base64buffer */
    unsigned long base64buffer = 0;  /* pending bits, most recent in the low end */
    Py_UCS4 surrogate = 0;           /* pending high surrogate, 0 if none */
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    if (size == 0) {
        if (consumed)
            *consumed = 0;
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* Start off assuming it's all ASCII. Widen later as necessary. */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = size;

    shiftOutStart = 0;
    e = s + size;

    while (s < e) {
        Py_UCS4 ch;
      restart:
        /* Also entered by `goto restart` from the end-of-string handling
           below, when the error handler moved the read position back
           inside the input. */
        ch = (unsigned char) *s;

        if (inShift) { /* in a base-64 section */
            if (IS_BASE64(ch)) { /* consume a base-64 character */
                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
                base64bits += 6;
                s++;
                if (base64bits >= 16) {
                    /* we have enough bits for a UTF-16 value */
                    Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
                    base64bits -= 16;
                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
                    assert(outCh <= 0xffff);
                    if (surrogate) {
                        /* expecting a second surrogate */
                        if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
                            Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
                            if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
                                goto onError;
                            surrogate = 0;
                            continue;
                        }
                        else {
                            /* Not a low surrogate: emit the pending high
                               surrogate on its own, then handle outCh
                               normally below. */
                            if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
                                goto onError;
                            surrogate = 0;
                        }
                    }
                    if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
                        /* first surrogate */
                        surrogate = outCh;
                    }
                    else {
                        if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
                            goto onError;
                    }
                }
            }
            else { /* now leaving a base-64 section */
                inShift = 0;
                if (base64bits > 0) { /* left-over bits */
                    if (base64bits >= 6) {
                        /* We've seen at least one base-64 character */
                        s++;
                        errmsg = "partial character in shift sequence";
                        goto utf7Error;
                    }
                    else {
                        /* Some bits remain; they should be zero */
                        if (base64buffer != 0) {
                            s++;
                            errmsg = "non-zero padding bits in shift sequence";
                            goto utf7Error;
                        }
                    }
                }
                /* A pending high surrogate is written out only when the
                   terminating byte decodes as itself; in either case it is
                   forgotten afterwards. */
                if (surrogate && DECODE_DIRECT(ch)) {
                    if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
                        goto onError;
                }
                surrogate = 0;
                if (ch == '-') {
                    /* '-' is absorbed; other terminating
                       characters are preserved */
                    s++;
                }
                /* s is not advanced for any other terminator, so that byte
                   is examined again on the next iteration, now outside the
                   shift sequence. */
            }
        }
        else if ( ch == '+' ) {
            /* Remember where the '+' is: this offset is reported for errors
               raised inside the shift sequence and through *consumed when
               the input ends inside it. */
            startinpos = s-starts;
            s++; /* consume '+' */
            if (s < e && *s == '-') { /* '+-' encodes '+' */
                s++;
                if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
                    goto onError;
            }
            else { /* begin base64-encoded section */
                inShift = 1;
                surrogate = 0;
                shiftOutStart = writer.pos;
                base64bits = 0;
                base64buffer = 0;
            }
        }
        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
            s++;
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                goto onError;
        }
        else {
            /* Byte above 127 outside a shift sequence. */
            startinpos = s-starts;
            s++;
            errmsg = "unexpected special character";
            goto utf7Error;
        }
        continue;
utf7Error:
        /* Common error exit: s already points just past the offending byte.
           The handler call updates s (and possibly starts/e) before the
           loop condition is evaluated again. */
        endinpos = s-starts;
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                "utf7", errmsg,
                &starts, &e, &startinpos, &endinpos, &exc, &s,
                &writer))
            goto onError;
    }

    /* end of string */

    if (inShift && !consumed) { /* in shift sequence, no more to follow */
        /* if we're in an inconsistent state, that's an error */
        inShift = 0;
        if (surrogate ||
                (base64bits >= 6) ||
                (base64bits > 0 && base64buffer != 0)) {
            endinpos = size;
            if (unicode_decode_call_errorhandler_writer(
                    errors, &errorHandler,
                    "utf7", "unterminated shift sequence",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    &writer))
                goto onError;
            /* The handler may have chosen a resume position before the end
               of the input; if so, continue decoding from there. */
            if (s < e)
                goto restart;
        }
    }

    /* return state */
    if (consumed) {
        if (inShift) {
            *consumed = startinpos;
            /* Output was produced inside the unfinished shift sequence and
               the writer holds non-ASCII data: build the result directly
               from the first shiftOutStart characters rather than by
               rewinding writer.pos as done below. */
            if (writer.pos != shiftOutStart && writer.maxchar > 127) {
                PyObject *result = PyUnicode_FromKindAndData(
                        writer.kind, writer.data, shiftOutStart);
                Py_XDECREF(errorHandler);
                Py_XDECREF(exc);
                _PyUnicodeWriter_Dealloc(&writer);
                return result;
            }
            writer.pos = shiftOutStart; /* back off output */
        }
        else {
            *consumed = s-starts;
        }
    }

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    _PyUnicodeWriter_Dealloc(&writer);
    return NULL;
}
4687
4688
Alexander Belopolsky40018472011-02-26 01:02:56 +00004689PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004690_PyUnicode_EncodeUTF7(PyObject *str,
4691 int base64SetO,
4692 int base64WhiteSpace,
4693 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004694{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004695 int kind;
4696 void *data;
4697 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004698 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004699 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004700 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004701 unsigned int base64bits = 0;
4702 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004703 char * out;
4704 char * start;
4705
Benjamin Petersonbac79492012-01-14 13:34:47 -05004706 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004707 return NULL;
4708 kind = PyUnicode_KIND(str);
4709 data = PyUnicode_DATA(str);
4710 len = PyUnicode_GET_LENGTH(str);
4711
4712 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004713 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004714
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004715 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004716 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004717 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004718 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004719 if (v == NULL)
4720 return NULL;
4721
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004722 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004723 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004724 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004725
Antoine Pitrou244651a2009-05-04 18:56:13 +00004726 if (inShift) {
4727 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4728 /* shifting out */
4729 if (base64bits) { /* output remaining bits */
4730 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4731 base64buffer = 0;
4732 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004733 }
4734 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004735 /* Characters not in the BASE64 set implicitly unshift the sequence
4736 so no '-' is required, except if the character is itself a '-' */
4737 if (IS_BASE64(ch) || ch == '-') {
4738 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004739 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004740 *out++ = (char) ch;
4741 }
4742 else {
4743 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004744 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004745 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004746 else { /* not in a shift sequence */
4747 if (ch == '+') {
4748 *out++ = '+';
4749 *out++ = '-';
4750 }
4751 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4752 *out++ = (char) ch;
4753 }
4754 else {
4755 *out++ = '+';
4756 inShift = 1;
4757 goto encode_char;
4758 }
4759 }
4760 continue;
4761encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004762 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004763 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004764
Antoine Pitrou244651a2009-05-04 18:56:13 +00004765 /* code first surrogate */
4766 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004767 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004768 while (base64bits >= 6) {
4769 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4770 base64bits -= 6;
4771 }
4772 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004773 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004774 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004775 base64bits += 16;
4776 base64buffer = (base64buffer << 16) | ch;
4777 while (base64bits >= 6) {
4778 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4779 base64bits -= 6;
4780 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004781 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004782 if (base64bits)
4783 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4784 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004785 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004786 if (_PyBytes_Resize(&v, out - start) < 0)
4787 return NULL;
4788 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004789}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004790PyObject *
4791PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4792 Py_ssize_t size,
4793 int base64SetO,
4794 int base64WhiteSpace,
4795 const char *errors)
4796{
4797 PyObject *result;
4798 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4799 if (tmp == NULL)
4800 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004801 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004802 base64WhiteSpace, errors);
4803 Py_DECREF(tmp);
4804 return result;
4805}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004806
Antoine Pitrou244651a2009-05-04 18:56:13 +00004807#undef IS_BASE64
4808#undef FROM_BASE64
4809#undef TO_BASE64
4810#undef DECODE_DIRECT
4811#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004812
Guido van Rossumd57fd912000-03-10 22:53:23 +00004813/* --- UTF-8 Codec -------------------------------------------------------- */
4814
Alexander Belopolsky40018472011-02-26 01:02:56 +00004815PyObject *
4816PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004817 Py_ssize_t size,
4818 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004819{
Walter Dörwald69652032004-09-07 20:24:22 +00004820 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4821}
4822
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004823#include "stringlib/asciilib.h"
4824#include "stringlib/codecs.h"
4825#include "stringlib/undef.h"
4826
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004827#include "stringlib/ucs1lib.h"
4828#include "stringlib/codecs.h"
4829#include "stringlib/undef.h"
4830
4831#include "stringlib/ucs2lib.h"
4832#include "stringlib/codecs.h"
4833#include "stringlib/undef.h"
4834
4835#include "stringlib/ucs4lib.h"
4836#include "stringlib/codecs.h"
4837#include "stringlib/undef.h"
4838
Antoine Pitrouab868312009-01-10 15:40:25 +00004839/* Mask to quickly check whether a C 'long' contains a
4840 non-ASCII, UTF8-encoded char. */
4841#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004842# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004843#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004844# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004845#else
4846# error C 'long' size should be either 4 or 8!
4847#endif
4848
/* Copy the leading run of ASCII bytes (bytes with the high bit clear) of
   [start, end) into dest and return the length of that run.  Scanning
   stops at the first byte >= 0x80 or at `end`, whichever comes first.

   Where possible the input is examined one C 'long' at a time, using
   ASCII_CHAR_MASK to test all bytes of the word at once. */
static Py_ssize_t
ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
{
    const char *p = start;
    /* Word-sized reads must begin before this address; it is `end`
       aligned down to SIZEOF_LONG. */
    const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);

    /*
     * Issue #17237: m68k is a bit different from most architectures in
     * that objects do not use "natural alignment" - for example, int and
     * long are only aligned at 2-byte boundaries.  Therefore the assert()
     * won't work; also, tests have shown that skipping the "optimised
     * version" will even speed up m68k.
     */
#if !defined(__m68k__)
#if SIZEOF_LONG <= SIZEOF_VOID_P
    assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
    if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
        /* Fast path, see in STRINGLIB(utf8_decode) for
           an explanation. */
        /* Source and destination are both long-aligned here, so whole
           words can be copied directly while they contain only ASCII. */
        /* Help allocation */
        const char *_p = p;
        Py_UCS1 * q = dest;
        while (_p < aligned_end) {
            unsigned long value = *(const unsigned long *) _p;
            if (value & ASCII_CHAR_MASK)
                break;
            *((unsigned long *)q) = value;
            _p += SIZEOF_LONG;
            q += SIZEOF_LONG;
        }
        p = _p;
        /* Finish byte by byte: the unaligned tail, or the ASCII bytes in
           front of the non-ASCII byte inside the word that stopped the
           loop above. */
        while (p < end) {
            if ((unsigned char)*p & 0x80)
                break;
            *q++ = *p++;
        }
        return p - start;
    }
#endif
#endif
    /* General path: only scan here; the bytes found are copied with a
       single memcpy() at the end. */
    while (p < end) {
        /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
           for an explanation. */
        if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
            /* Help allocation */
            const char *_p = p;
            while (_p < aligned_end) {
                unsigned long value = *(unsigned long *) _p;
                if (value & ASCII_CHAR_MASK)
                    break;
                _p += SIZEOF_LONG;
            }
            p = _p;
            /* Nothing is left to inspect byte-wise if the word loop ran
               right up to `end`. */
            if (_p == end)
                break;
        }
        if ((unsigned char)*p & 0x80)
            break;
        ++p;
    }
    memcpy(dest, start, p - start);
    return p - start;
}
Antoine Pitrouab868312009-01-10 15:40:25 +00004912
Victor Stinner785938e2011-12-11 20:09:03 +01004913PyObject *
4914PyUnicode_DecodeUTF8Stateful(const char *s,
4915 Py_ssize_t size,
4916 const char *errors,
4917 Py_ssize_t *consumed)
4918{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004919 _PyUnicodeWriter writer;
Victor Stinner785938e2011-12-11 20:09:03 +01004920 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004921 const char *end = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004922
4923 Py_ssize_t startinpos;
4924 Py_ssize_t endinpos;
4925 const char *errmsg = "";
Victor Stinner1d65d912015-10-05 13:43:50 +02004926 PyObject *error_handler_obj = NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004927 PyObject *exc = NULL;
Victor Stinner1d65d912015-10-05 13:43:50 +02004928 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner785938e2011-12-11 20:09:03 +01004929
4930 if (size == 0) {
4931 if (consumed)
4932 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004933 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004934 }
4935
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004936 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4937 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004938 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004939 *consumed = 1;
4940 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004941 }
4942
Victor Stinner8f674cc2013-04-17 23:02:17 +02004943 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02004944 writer.min_length = size;
4945 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004946 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004947
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004948 writer.pos = ascii_decode(s, end, writer.data);
4949 s += writer.pos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004950 while (s < end) {
4951 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004952 int kind = writer.kind;
Victor Stinner1d65d912015-10-05 13:43:50 +02004953
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004954 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004955 if (PyUnicode_IS_ASCII(writer.buffer))
4956 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004957 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004958 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004959 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004960 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004961 } else {
4962 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004963 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004964 }
4965
4966 switch (ch) {
4967 case 0:
4968 if (s == end || consumed)
4969 goto End;
4970 errmsg = "unexpected end of data";
4971 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004972 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004973 break;
4974 case 1:
4975 errmsg = "invalid start byte";
4976 startinpos = s - starts;
4977 endinpos = startinpos + 1;
4978 break;
4979 case 2:
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004980 case 3:
4981 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004982 errmsg = "invalid continuation byte";
4983 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004984 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004985 break;
4986 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004987 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004988 goto onError;
4989 continue;
4990 }
4991
Victor Stinner1d65d912015-10-05 13:43:50 +02004992 if (error_handler == _Py_ERROR_UNKNOWN)
4993 error_handler = get_error_handler(errors);
4994
4995 switch (error_handler) {
4996 case _Py_ERROR_IGNORE:
4997 s += (endinpos - startinpos);
4998 break;
4999
5000 case _Py_ERROR_REPLACE:
5001 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
5002 goto onError;
5003 s += (endinpos - startinpos);
5004 break;
5005
5006 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner74e8fac2015-10-05 13:49:26 +02005007 {
5008 Py_ssize_t i;
5009
Victor Stinner1d65d912015-10-05 13:43:50 +02005010 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
5011 goto onError;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005012 for (i=startinpos; i<endinpos; i++) {
Victor Stinner1d65d912015-10-05 13:43:50 +02005013 ch = (Py_UCS4)(unsigned char)(starts[i]);
5014 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
5015 ch + 0xdc00);
5016 writer.pos++;
5017 }
5018 s += (endinpos - startinpos);
5019 break;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005020 }
Victor Stinner1d65d912015-10-05 13:43:50 +02005021
5022 default:
5023 if (unicode_decode_call_errorhandler_writer(
5024 errors, &error_handler_obj,
5025 "utf-8", errmsg,
5026 &starts, &end, &startinpos, &endinpos, &exc, &s,
5027 &writer))
5028 goto onError;
5029 }
Victor Stinner785938e2011-12-11 20:09:03 +01005030 }
5031
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005032End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005033 if (consumed)
5034 *consumed = s - starts;
5035
Victor Stinner1d65d912015-10-05 13:43:50 +02005036 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005037 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005038 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005039
5040onError:
Victor Stinner1d65d912015-10-05 13:43:50 +02005041 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005042 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005043 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005044 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01005045}
5046
#ifdef __APPLE__

/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X.

   s     UTF-8 encoded input (not required to be NUL-terminated; exactly
         size bytes are read)
   size  number of bytes in s

   Every byte that is not part of a valid UTF-8 sequence is stored as the
   lone surrogate 0xDC00 + byte.

   Return a pointer to a newly allocated wide character string (use
   PyMem_RawFree() to free the memory), or NULL on memory allocation error. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    const char *e;
    wchar_t *unicode;
    Py_ssize_t outpos;

    /* Note: size will always be longer than the resulting Unicode
       character count, so size + 1 wide characters (including the
       terminator) are enough.  The bound is checked without computing
       size + 1, which would be a signed overflow for
       size == PY_SSIZE_T_MAX. */
    if (size > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1)
        return NULL;
    unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
    if (!unicode)
        return NULL;

    /* Unpack UTF-8 encoded data */
    e = s + size;
    outpos = 0;
    while (s < e) {
        Py_UCS4 ch;
#if SIZEOF_WCHAR_T == 4
        ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
#else
        ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
#endif
        if (ch > 0xFF) {
#if SIZEOF_WCHAR_T == 4
            /* The 4-byte decoder stores every code point itself, so it is
               not expected to hand back a character. */
            assert(0);
#else
            /* The 2-byte decoder hands back code points that do not fit
               in 16 bits; these are non-BMP code points, not surrogates. */
            assert(ch > 0xFFFF && ch <= 0x10FFFF);
            /* compute and append the two surrogates: */
            unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
            unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
#endif
        }
        else {
            /* ch == 0 with all input consumed means we are done */
            if (!ch && s == e)
                break;
            /* surrogateescape: escape one byte and resume after it */
            unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
        }
    }
    unicode[outpos] = L'\0';
    return unicode;
}

#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00005102
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005103/* Primary internal function which creates utf8 encoded bytes objects.
5104
5105 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005106 and allocate exactly as much space needed at the end. Else allocate the
5107 maximum possible needed (4 result bytes per Unicode character), and return
5108 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005109*/
Tim Peters7e3d9612002-04-21 03:26:37 +00005110PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01005111_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005112{
Victor Stinner6099a032011-12-18 14:22:26 +01005113 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005114 void *data;
5115 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00005116
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005117 if (!PyUnicode_Check(unicode)) {
5118 PyErr_BadArgument();
5119 return NULL;
5120 }
5121
5122 if (PyUnicode_READY(unicode) == -1)
5123 return NULL;
5124
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005125 if (PyUnicode_UTF8(unicode))
5126 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5127 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005128
5129 kind = PyUnicode_KIND(unicode);
5130 data = PyUnicode_DATA(unicode);
5131 size = PyUnicode_GET_LENGTH(unicode);
5132
Benjamin Petersonead6b532011-12-20 17:23:42 -06005133 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005134 default:
5135 assert(0);
5136 case PyUnicode_1BYTE_KIND:
5137 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5138 assert(!PyUnicode_IS_ASCII(unicode));
5139 return ucs1lib_utf8_encoder(unicode, data, size, errors);
5140 case PyUnicode_2BYTE_KIND:
5141 return ucs2lib_utf8_encoder(unicode, data, size, errors);
5142 case PyUnicode_4BYTE_KIND:
5143 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00005144 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005145}
5146
Alexander Belopolsky40018472011-02-26 01:02:56 +00005147PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005148PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5149 Py_ssize_t size,
5150 const char *errors)
5151{
5152 PyObject *v, *unicode;
5153
5154 unicode = PyUnicode_FromUnicode(s, size);
5155 if (unicode == NULL)
5156 return NULL;
5157 v = _PyUnicode_AsUTF8String(unicode, errors);
5158 Py_DECREF(unicode);
5159 return v;
5160}
5161
5162PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005163PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005164{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005165 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005166}
5167
/* --- UTF-32 Codec ------------------------------------------------------- */
5169
5170PyObject *
5171PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005172 Py_ssize_t size,
5173 const char *errors,
5174 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005175{
5176 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5177}
5178
5179PyObject *
5180PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005181 Py_ssize_t size,
5182 const char *errors,
5183 int *byteorder,
5184 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005185{
5186 const char *starts = s;
5187 Py_ssize_t startinpos;
5188 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005189 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005190 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01005191 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005192 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005193 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005194 PyObject *errorHandler = NULL;
5195 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005196
Walter Dörwald41980ca2007-08-16 21:55:45 +00005197 q = (unsigned char *)s;
5198 e = q + size;
5199
5200 if (byteorder)
5201 bo = *byteorder;
5202
5203 /* Check for BOM marks (U+FEFF) in the input and adjust current
5204 byte order setting accordingly. In native mode, the leading BOM
5205 mark is skipped, in all other modes, it is copied to the output
5206 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01005207 if (bo == 0 && size >= 4) {
5208 Py_UCS4 bom = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5209 if (bom == 0x0000FEFF) {
5210 bo = -1;
5211 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005212 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005213 else if (bom == 0xFFFE0000) {
5214 bo = 1;
5215 q += 4;
5216 }
5217 if (byteorder)
5218 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005219 }
5220
Victor Stinnere64322e2012-10-30 23:12:47 +01005221 if (q == e) {
5222 if (consumed)
5223 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005224 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00005225 }
5226
Victor Stinnere64322e2012-10-30 23:12:47 +01005227#ifdef WORDS_BIGENDIAN
5228 le = bo < 0;
5229#else
5230 le = bo <= 0;
5231#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005232 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01005233
Victor Stinner8f674cc2013-04-17 23:02:17 +02005234 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005235 writer.min_length = (e - q + 3) / 4;
5236 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005237 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005238
Victor Stinnere64322e2012-10-30 23:12:47 +01005239 while (1) {
5240 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005241 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005242
Victor Stinnere64322e2012-10-30 23:12:47 +01005243 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005244 enum PyUnicode_Kind kind = writer.kind;
5245 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005246 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005247 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005248 if (le) {
5249 do {
5250 ch = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5251 if (ch > maxch)
5252 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005253 if (kind != PyUnicode_1BYTE_KIND &&
5254 Py_UNICODE_IS_SURROGATE(ch))
5255 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005256 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005257 q += 4;
5258 } while (q <= last);
5259 }
5260 else {
5261 do {
5262 ch = (q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
5263 if (ch > maxch)
5264 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005265 if (kind != PyUnicode_1BYTE_KIND &&
5266 Py_UNICODE_IS_SURROGATE(ch))
5267 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005268 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005269 q += 4;
5270 } while (q <= last);
5271 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005272 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005273 }
5274
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005275 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005276 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005277 startinpos = ((const char *)q) - starts;
5278 endinpos = startinpos + 4;
5279 }
5280 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005281 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005282 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005283 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005284 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005285 startinpos = ((const char *)q) - starts;
5286 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005287 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005288 else {
5289 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005290 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005291 goto onError;
5292 q += 4;
5293 continue;
5294 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005295 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005296 startinpos = ((const char *)q) - starts;
5297 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005298 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005299
5300 /* The remaining input chars are ignored if the callback
5301 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005302 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005303 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005304 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005305 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005306 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005307 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005308 }
5309
Walter Dörwald41980ca2007-08-16 21:55:45 +00005310 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005311 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005312
Walter Dörwald41980ca2007-08-16 21:55:45 +00005313 Py_XDECREF(errorHandler);
5314 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005315 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005316
Benjamin Peterson29060642009-01-31 22:14:21 +00005317 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005318 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005319 Py_XDECREF(errorHandler);
5320 Py_XDECREF(exc);
5321 return NULL;
5322}
5323
5324PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005325_PyUnicode_EncodeUTF32(PyObject *str,
5326 const char *errors,
5327 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005328{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005329 enum PyUnicode_Kind kind;
5330 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005331 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005332 PyObject *v;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005333 PY_UINT32_T *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005334#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005335 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005336#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005337 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005338#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005339 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005340 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005341 PyObject *errorHandler = NULL;
5342 PyObject *exc = NULL;
5343 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005344
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005345 if (!PyUnicode_Check(str)) {
5346 PyErr_BadArgument();
5347 return NULL;
5348 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005349 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005350 return NULL;
5351 kind = PyUnicode_KIND(str);
5352 data = PyUnicode_DATA(str);
5353 len = PyUnicode_GET_LENGTH(str);
5354
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005355 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005356 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005357 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005358 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005359 if (v == NULL)
5360 return NULL;
5361
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005362 /* output buffer is 4-bytes aligned */
5363 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
5364 out = (PY_UINT32_T *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005365 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005366 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005367 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005368 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005369
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005370 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005371 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005372 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005373 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005374 else
5375 encoding = "utf-32";
5376
5377 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005378 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5379 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005380 }
5381
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005382 pos = 0;
5383 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005384 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005385
5386 if (kind == PyUnicode_2BYTE_KIND) {
5387 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5388 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005389 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005390 else {
5391 assert(kind == PyUnicode_4BYTE_KIND);
5392 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5393 &out, native_ordering);
5394 }
5395 if (pos == len)
5396 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005397
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005398 rep = unicode_encode_call_errorhandler(
5399 errors, &errorHandler,
5400 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005401 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005402 if (!rep)
5403 goto error;
5404
5405 if (PyBytes_Check(rep)) {
5406 repsize = PyBytes_GET_SIZE(rep);
5407 if (repsize & 3) {
5408 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005409 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005410 "surrogates not allowed");
5411 goto error;
5412 }
5413 moreunits = repsize / 4;
5414 }
5415 else {
5416 assert(PyUnicode_Check(rep));
5417 if (PyUnicode_READY(rep) < 0)
5418 goto error;
5419 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5420 if (!PyUnicode_IS_ASCII(rep)) {
5421 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005422 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005423 "surrogates not allowed");
5424 goto error;
5425 }
5426 }
5427
5428 /* four bytes are reserved for each surrogate */
5429 if (moreunits > 1) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005430 Py_ssize_t outpos = out - (PY_UINT32_T*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005431 Py_ssize_t morebytes = 4 * (moreunits - 1);
5432 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5433 /* integer overflow */
5434 PyErr_NoMemory();
5435 goto error;
5436 }
5437 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5438 goto error;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005439 out = (PY_UINT32_T*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005440 }
5441
5442 if (PyBytes_Check(rep)) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005443 Py_MEMCPY(out, PyBytes_AS_STRING(rep), repsize);
5444 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005445 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005446 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005447 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5448 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005449 }
5450
5451 Py_CLEAR(rep);
5452 }
5453
5454 /* Cut back to size actually needed. This is necessary for, for example,
5455 encoding of a string containing isolated surrogates and the 'ignore'
5456 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005457 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005458 if (nsize != PyBytes_GET_SIZE(v))
5459 _PyBytes_Resize(&v, nsize);
5460 Py_XDECREF(errorHandler);
5461 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005462 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005463 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005464 error:
5465 Py_XDECREF(rep);
5466 Py_XDECREF(errorHandler);
5467 Py_XDECREF(exc);
5468 Py_XDECREF(v);
5469 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005470}
5471
Alexander Belopolsky40018472011-02-26 01:02:56 +00005472PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005473PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5474 Py_ssize_t size,
5475 const char *errors,
5476 int byteorder)
5477{
5478 PyObject *result;
5479 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5480 if (tmp == NULL)
5481 return NULL;
5482 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5483 Py_DECREF(tmp);
5484 return result;
5485}
5486
5487PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005488PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005489{
Victor Stinnerb960b342011-11-20 19:12:52 +01005490 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005491}
5492
/* --- UTF-16 Codec ------------------------------------------------------- */
5494
Tim Peters772747b2001-08-09 22:21:55 +00005495PyObject *
5496PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005497 Py_ssize_t size,
5498 const char *errors,
5499 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005500{
Walter Dörwald69652032004-09-07 20:24:22 +00005501 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5502}
5503
5504PyObject *
5505PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005506 Py_ssize_t size,
5507 const char *errors,
5508 int *byteorder,
5509 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005510{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005511 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005512 Py_ssize_t startinpos;
5513 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005514 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005515 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005516 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005517 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005518 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005519 PyObject *errorHandler = NULL;
5520 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005521 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005522
Tim Peters772747b2001-08-09 22:21:55 +00005523 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005524 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005525
5526 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005527 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005528
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005529 /* Check for BOM marks (U+FEFF) in the input and adjust current
5530 byte order setting accordingly. In native mode, the leading BOM
5531 mark is skipped, in all other modes, it is copied to the output
5532 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005533 if (bo == 0 && size >= 2) {
5534 const Py_UCS4 bom = (q[1] << 8) | q[0];
5535 if (bom == 0xFEFF) {
5536 q += 2;
5537 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005538 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005539 else if (bom == 0xFFFE) {
5540 q += 2;
5541 bo = 1;
5542 }
5543 if (byteorder)
5544 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005545 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005546
Antoine Pitrou63065d72012-05-15 23:48:04 +02005547 if (q == e) {
5548 if (consumed)
5549 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005550 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005551 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005552
Christian Heimes743e0cd2012-10-17 23:52:17 +02005553#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005554 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005555 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005556#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005557 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005558 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005559#endif
Tim Peters772747b2001-08-09 22:21:55 +00005560
Antoine Pitrou63065d72012-05-15 23:48:04 +02005561 /* Note: size will always be longer than the resulting Unicode
5562 character count */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005563 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005564 writer.min_length = (e - q + 1) / 2;
5565 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005566 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005567
Antoine Pitrou63065d72012-05-15 23:48:04 +02005568 while (1) {
5569 Py_UCS4 ch = 0;
5570 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005571 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005572 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005573 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005574 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005575 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005576 native_ordering);
5577 else
5578 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005579 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005580 native_ordering);
5581 } else if (kind == PyUnicode_2BYTE_KIND) {
5582 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005583 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005584 native_ordering);
5585 } else {
5586 assert(kind == PyUnicode_4BYTE_KIND);
5587 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005588 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005589 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005590 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005591 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005592
Antoine Pitrou63065d72012-05-15 23:48:04 +02005593 switch (ch)
5594 {
5595 case 0:
5596 /* remaining byte at the end? (size should be even) */
5597 if (q == e || consumed)
5598 goto End;
5599 errmsg = "truncated data";
5600 startinpos = ((const char *)q) - starts;
5601 endinpos = ((const char *)e) - starts;
5602 break;
5603 /* The remaining input chars are ignored if the callback
5604 chooses to skip the input */
5605 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005606 q -= 2;
5607 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005608 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005609 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005610 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005611 endinpos = ((const char *)e) - starts;
5612 break;
5613 case 2:
5614 errmsg = "illegal encoding";
5615 startinpos = ((const char *)q) - 2 - starts;
5616 endinpos = startinpos + 2;
5617 break;
5618 case 3:
5619 errmsg = "illegal UTF-16 surrogate";
5620 startinpos = ((const char *)q) - 4 - starts;
5621 endinpos = startinpos + 2;
5622 break;
5623 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005624 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005625 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005626 continue;
5627 }
5628
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005629 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005630 errors,
5631 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005632 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00005633 &starts,
5634 (const char **)&e,
5635 &startinpos,
5636 &endinpos,
5637 &exc,
5638 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005639 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005640 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005641 }
5642
Antoine Pitrou63065d72012-05-15 23:48:04 +02005643End:
Walter Dörwald69652032004-09-07 20:24:22 +00005644 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005645 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005646
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005647 Py_XDECREF(errorHandler);
5648 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005649 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005650
Benjamin Peterson29060642009-01-31 22:14:21 +00005651 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005652 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005653 Py_XDECREF(errorHandler);
5654 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005655 return NULL;
5656}
5657
Tim Peters772747b2001-08-09 22:21:55 +00005658PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005659_PyUnicode_EncodeUTF16(PyObject *str,
5660 const char *errors,
5661 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005662{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005663 enum PyUnicode_Kind kind;
5664 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005665 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005666 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005667 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005668 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005669#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005670 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005671#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005672 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005673#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005674 const char *encoding;
5675 Py_ssize_t nsize, pos;
5676 PyObject *errorHandler = NULL;
5677 PyObject *exc = NULL;
5678 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00005679
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005680 if (!PyUnicode_Check(str)) {
5681 PyErr_BadArgument();
5682 return NULL;
5683 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005684 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005685 return NULL;
5686 kind = PyUnicode_KIND(str);
5687 data = PyUnicode_DATA(str);
5688 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005689
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005690 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005691 if (kind == PyUnicode_4BYTE_KIND) {
5692 const Py_UCS4 *in = (const Py_UCS4 *)data;
5693 const Py_UCS4 *end = in + len;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005694 while (in < end) {
5695 if (*in++ >= 0x10000) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005696 pairs++;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005697 }
5698 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005699 }
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005700 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005701 return PyErr_NoMemory();
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005702 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005703 nsize = len + pairs + (byteorder == 0);
5704 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005705 if (v == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005706 return NULL;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005707 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005708
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005709 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005710 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005711 out = (unsigned short *)PyBytes_AS_STRING(v);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005712 if (byteorder == 0) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005713 *out++ = 0xFEFF;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005714 }
5715 if (len == 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00005716 goto done;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005717 }
Tim Peters772747b2001-08-09 22:21:55 +00005718
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005719 if (kind == PyUnicode_1BYTE_KIND) {
5720 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5721 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005722 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005723
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005724 if (byteorder < 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005725 encoding = "utf-16-le";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005726 }
5727 else if (byteorder > 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005728 encoding = "utf-16-be";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005729 }
5730 else {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005731 encoding = "utf-16";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005732 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005733
5734 pos = 0;
5735 while (pos < len) {
5736 Py_ssize_t repsize, moreunits;
5737
5738 if (kind == PyUnicode_2BYTE_KIND) {
5739 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
5740 &out, native_ordering);
5741 }
5742 else {
5743 assert(kind == PyUnicode_4BYTE_KIND);
5744 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
5745 &out, native_ordering);
5746 }
5747 if (pos == len)
5748 break;
5749
5750 rep = unicode_encode_call_errorhandler(
5751 errors, &errorHandler,
5752 encoding, "surrogates not allowed",
5753 str, &exc, pos, pos + 1, &pos);
5754 if (!rep)
5755 goto error;
5756
5757 if (PyBytes_Check(rep)) {
5758 repsize = PyBytes_GET_SIZE(rep);
5759 if (repsize & 1) {
5760 raise_encode_exception(&exc, encoding,
5761 str, pos - 1, pos,
5762 "surrogates not allowed");
5763 goto error;
5764 }
5765 moreunits = repsize / 2;
5766 }
5767 else {
5768 assert(PyUnicode_Check(rep));
5769 if (PyUnicode_READY(rep) < 0)
5770 goto error;
5771 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5772 if (!PyUnicode_IS_ASCII(rep)) {
5773 raise_encode_exception(&exc, encoding,
5774 str, pos - 1, pos,
5775 "surrogates not allowed");
5776 goto error;
5777 }
5778 }
5779
5780 /* two bytes are reserved for each surrogate */
5781 if (moreunits > 1) {
5782 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
5783 Py_ssize_t morebytes = 2 * (moreunits - 1);
5784 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5785 /* integer overflow */
5786 PyErr_NoMemory();
5787 goto error;
5788 }
5789 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5790 goto error;
5791 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
5792 }
5793
5794 if (PyBytes_Check(rep)) {
5795 Py_MEMCPY(out, PyBytes_AS_STRING(rep), repsize);
5796 out += moreunits;
5797 } else /* rep is unicode */ {
5798 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5799 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5800 &out, native_ordering);
5801 }
5802
5803 Py_CLEAR(rep);
5804 }
5805
5806 /* Cut back to size actually needed. This is necessary for, for example,
5807 encoding of a string containing isolated surrogates and the 'ignore' handler
5808 is used. */
5809 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5810 if (nsize != PyBytes_GET_SIZE(v))
5811 _PyBytes_Resize(&v, nsize);
5812 Py_XDECREF(errorHandler);
5813 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00005814 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005815 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005816 error:
5817 Py_XDECREF(rep);
5818 Py_XDECREF(errorHandler);
5819 Py_XDECREF(exc);
5820 Py_XDECREF(v);
5821 return NULL;
5822#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005823}
5824
Alexander Belopolsky40018472011-02-26 01:02:56 +00005825PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005826PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5827 Py_ssize_t size,
5828 const char *errors,
5829 int byteorder)
5830{
5831 PyObject *result;
5832 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5833 if (tmp == NULL)
5834 return NULL;
5835 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5836 Py_DECREF(tmp);
5837 return result;
5838}
5839
5840PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005841PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005842{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005843 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005844}
5845
5846/* --- Unicode Escape Codec ----------------------------------------------- */
5847
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005848/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
5849 if all the escapes in the string make it still a valid ASCII string.
5850 Returns -1 if any escapes were found which cause the string to
5851 pop out of ASCII range. Otherwise returns the length of the
5852 required buffer to hold the string.
5853 */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02005854static Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005855length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
5856{
5857 const unsigned char *p = (const unsigned char *)s;
5858 const unsigned char *end = p + size;
5859 Py_ssize_t length = 0;
5860
5861 if (size < 0)
5862 return -1;
5863
5864 for (; p < end; ++p) {
5865 if (*p > 127) {
5866 /* Non-ASCII */
5867 return -1;
5868 }
5869 else if (*p != '\\') {
5870 /* Normal character */
5871 ++length;
5872 }
5873 else {
5874 /* Backslash-escape, check next char */
5875 ++p;
5876 /* Escape sequence reaches till end of string or
5877 non-ASCII follow-up. */
5878 if (p >= end || *p > 127)
5879 return -1;
5880 switch (*p) {
5881 case '\n':
5882 /* backslash + \n result in zero characters */
5883 break;
5884 case '\\': case '\'': case '\"':
5885 case 'b': case 'f': case 't':
5886 case 'n': case 'r': case 'v': case 'a':
5887 ++length;
5888 break;
5889 case '0': case '1': case '2': case '3':
5890 case '4': case '5': case '6': case '7':
5891 case 'x': case 'u': case 'U': case 'N':
5892 /* these do not guarantee ASCII characters */
5893 return -1;
5894 default:
5895 /* count the backslash + the other character */
5896 length += 2;
5897 }
5898 }
5899 }
5900 return length;
5901}
5902
/* Cached pointer to the unicodedata name-lookup C API.  Starts out NULL;
   PyUnicode_DecodeUnicodeEscape() fills it in via PyCapsule_Import() the
   first time it meets a \N escape and uses its getcode() entry. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00005903static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005904
Alexander Belopolsky40018472011-02-26 01:02:56 +00005905PyObject *
5906PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005907 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005908 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005909{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005910 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005911 Py_ssize_t startinpos;
5912 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005913 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005914 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005915 char* message;
5916 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005917 PyObject *errorHandler = NULL;
5918 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005919 Py_ssize_t len;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005920
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005921 len = length_of_escaped_ascii_string(s, size);
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005922 if (len == 0)
5923 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005924
5925 /* After length_of_escaped_ascii_string() there are two alternatives,
5926 either the string is pure ASCII with named escapes like \n, etc.
5927 and we determined it's exact size (common case)
5928 or it contains \x, \u, ... escape sequences. then we create a
5929 legacy wchar string and resize it at the end of this function. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005930 _PyUnicodeWriter_Init(&writer);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005931 if (len > 0) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02005932 writer.min_length = len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005933 }
5934 else {
5935 /* Escaped strings will always be longer than the resulting
5936 Unicode string, so we start with size here and then reduce the
5937 length after conversion to the true value.
5938 (but if the error callback returns a long replacement string
5939 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005940 writer.min_length = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005941 }
5942
Guido van Rossumd57fd912000-03-10 22:53:23 +00005943 if (size == 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005944 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005945 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005946
Guido van Rossumd57fd912000-03-10 22:53:23 +00005947 while (s < end) {
5948 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005949 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005950 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005951
5952 /* Non-escape characters are interpreted as Unicode ordinals */
5953 if (*s != '\\') {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005954 x = (unsigned char)*s;
5955 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005956 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005957 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005958 continue;
5959 }
5960
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005961 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005962 /* \ - Escapes */
5963 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005964 c = *s++;
5965 if (s > end)
5966 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005967
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005968 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005969
Benjamin Peterson29060642009-01-31 22:14:21 +00005970 /* \x escapes */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005971#define WRITECHAR(ch) \
5972 do { \
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005973 if (_PyUnicodeWriter_WriteCharInline(&writer, (ch)) < 0) \
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005974 goto onError; \
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005975 } while(0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005976
Guido van Rossumd57fd912000-03-10 22:53:23 +00005977 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005978 case '\\': WRITECHAR('\\'); break;
5979 case '\'': WRITECHAR('\''); break;
5980 case '\"': WRITECHAR('\"'); break;
5981 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005982 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005983 case 'f': WRITECHAR('\014'); break;
5984 case 't': WRITECHAR('\t'); break;
5985 case 'n': WRITECHAR('\n'); break;
5986 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005987 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005988 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005989 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005990 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005991
Benjamin Peterson29060642009-01-31 22:14:21 +00005992 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005993 case '0': case '1': case '2': case '3':
5994 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005995 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005996 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005997 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005998 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005999 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006000 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006001 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006002 break;
6003
Benjamin Peterson29060642009-01-31 22:14:21 +00006004 /* hex escapes */
6005 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006006 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006007 digits = 2;
6008 message = "truncated \\xXX escape";
6009 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006010
Benjamin Peterson29060642009-01-31 22:14:21 +00006011 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006012 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006013 digits = 4;
6014 message = "truncated \\uXXXX escape";
6015 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006016
Benjamin Peterson29060642009-01-31 22:14:21 +00006017 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00006018 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006019 digits = 8;
6020 message = "truncated \\UXXXXXXXX escape";
6021 hexescape:
6022 chr = 0;
Serhiy Storchakad6793772013-01-29 10:20:44 +02006023 if (end - s < digits) {
6024 /* count only hex digits */
6025 for (; s < end; ++s) {
6026 c = (unsigned char)*s;
6027 if (!Py_ISXDIGIT(c))
6028 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00006029 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006030 goto error;
6031 }
6032 for (; digits--; ++s) {
6033 c = (unsigned char)*s;
6034 if (!Py_ISXDIGIT(c))
6035 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00006036 chr = (chr<<4) & ~0xF;
6037 if (c >= '0' && c <= '9')
6038 chr += c - '0';
6039 else if (c >= 'a' && c <= 'f')
6040 chr += 10 + c - 'a';
6041 else
6042 chr += 10 + c - 'A';
6043 }
Jeremy Hylton504de6b2003-10-06 05:08:26 +00006044 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006045 /* _decoding_error will have already written into the
6046 target buffer. */
6047 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006048 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00006049 /* when we get here, chr is a 32-bit unicode character */
Serhiy Storchaka24193de2013-01-29 10:28:07 +02006050 message = "illegal Unicode character";
6051 if (chr > MAX_UNICODE)
Serhiy Storchakad6793772013-01-29 10:20:44 +02006052 goto error;
Serhiy Storchaka24193de2013-01-29 10:28:07 +02006053 WRITECHAR(chr);
Fredrik Lundhccc74732001-02-18 22:13:49 +00006054 break;
6055
Benjamin Peterson29060642009-01-31 22:14:21 +00006056 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006057 case 'N':
6058 message = "malformed \\N character escape";
6059 if (ucnhash_CAPI == NULL) {
6060 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006061 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6062 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00006063 if (ucnhash_CAPI == NULL)
6064 goto ucnhashError;
6065 }
6066 if (*s == '{') {
6067 const char *start = s+1;
6068 /* look for the closing brace */
6069 while (*s != '}' && s < end)
6070 s++;
6071 if (s > start && s < end && *s == '}') {
6072 /* found a name. look it up in the unicode database */
6073 message = "unknown Unicode character name";
6074 s++;
Serhiy Storchaka4f5f0e52013-01-21 11:38:00 +02006075 if (s - start - 1 <= INT_MAX &&
Serhiy Storchakac35f3a92013-01-21 11:42:57 +02006076 ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03006077 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00006078 goto store;
6079 }
6080 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006081 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006082
6083 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00006084 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006085 message = "\\ at end of string";
6086 s--;
Serhiy Storchakad6793772013-01-29 10:20:44 +02006087 goto error;
Walter Dörwald8c077222002-03-25 11:16:18 +00006088 }
6089 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006090 WRITECHAR('\\');
Serhiy Storchaka73e38802013-01-25 23:52:21 +02006091 WRITECHAR((unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00006092 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006093 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006094 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006095 continue;
6096
6097 error:
6098 endinpos = s-starts;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006099 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02006100 errors, &errorHandler,
6101 "unicodeescape", message,
6102 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006103 &writer))
Serhiy Storchakad6793772013-01-29 10:20:44 +02006104 goto onError;
6105 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006106 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006107#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006108
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006109 Py_XDECREF(errorHandler);
6110 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006111 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00006112
Benjamin Peterson29060642009-01-31 22:14:21 +00006113 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00006114 PyErr_SetString(
6115 PyExc_UnicodeError,
6116 "\\N escapes not supported (can't load unicodedata module)"
6117 );
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006118 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006119 Py_XDECREF(errorHandler);
6120 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00006121 return NULL;
6122
Benjamin Peterson29060642009-01-31 22:14:21 +00006123 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006124 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006125 Py_XDECREF(errorHandler);
6126 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006127 return NULL;
6128}
6129
6130/* Return a Unicode-Escape string version of the Unicode object.
6131
6132 If quotes is true, the string is enclosed in u"" or u'' quotes as
6133 appropriate.
6134
6135*/
6136
Alexander Belopolsky40018472011-02-26 01:02:56 +00006137PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006138PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006139{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006140 Py_ssize_t i, len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006141 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006142 int kind;
6143 void *data;
Victor Stinner358af132015-10-12 22:36:57 +02006144 _PyBytesWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006145
Ezio Melottie7f90372012-10-05 03:33:31 +03006146 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00006147 escape.
6148
Ezio Melottie7f90372012-10-05 03:33:31 +03006149 For UCS1 strings it's '\xxx', 4 bytes per source character.
6150 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6151 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00006152 */
6153
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006154 if (!PyUnicode_Check(unicode)) {
6155 PyErr_BadArgument();
6156 return NULL;
6157 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006158 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006159 return NULL;
Victor Stinner358af132015-10-12 22:36:57 +02006160
6161 _PyBytesWriter_Init(&writer);
6162
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006163 len = PyUnicode_GET_LENGTH(unicode);
6164 kind = PyUnicode_KIND(unicode);
6165 data = PyUnicode_DATA(unicode);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006166
Victor Stinner358af132015-10-12 22:36:57 +02006167 p = _PyBytesWriter_Alloc(&writer, len);
6168 if (p == NULL)
6169 goto error;
6170 writer.overallocate = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006171
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006172 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006173 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006174
Walter Dörwald79e913e2007-05-12 11:08:06 +00006175 /* Escape backslashes */
6176 if (ch == '\\') {
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006177 /* -1: subtract 1 preallocated byte */
Victor Stinner358af132015-10-12 22:36:57 +02006178 p = _PyBytesWriter_Prepare(&writer, p, 2-1);
6179 if (p == NULL)
6180 goto error;
6181
Guido van Rossumd57fd912000-03-10 22:53:23 +00006182 *p++ = '\\';
6183 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00006184 continue;
Tim Petersced69f82003-09-16 20:30:58 +00006185 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006186
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006187 /* Map 21-bit characters to '\U00xxxxxx' */
6188 else if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006189 assert(ch <= MAX_UNICODE);
Victor Stinner358af132015-10-12 22:36:57 +02006190
6191 p = _PyBytesWriter_Prepare(&writer, p, 10-1);
6192 if (p == NULL)
6193 goto error;
6194
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006195 *p++ = '\\';
6196 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006197 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
6198 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
6199 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6200 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6201 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6202 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6203 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6204 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00006205 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006206 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00006207
Guido van Rossumd57fd912000-03-10 22:53:23 +00006208 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00006209 if (ch >= 256) {
Victor Stinner358af132015-10-12 22:36:57 +02006210 p = _PyBytesWriter_Prepare(&writer, p, 6-1);
6211 if (p == NULL)
6212 goto error;
6213
Guido van Rossumd57fd912000-03-10 22:53:23 +00006214 *p++ = '\\';
6215 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006216 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6217 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6218 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6219 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006220 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006221
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006222 /* Map special whitespace to '\t', \n', '\r' */
6223 else if (ch == '\t') {
Victor Stinner358af132015-10-12 22:36:57 +02006224 p = _PyBytesWriter_Prepare(&writer, p, 2-1);
6225 if (p == NULL)
6226 goto error;
6227
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006228 *p++ = '\\';
6229 *p++ = 't';
6230 }
6231 else if (ch == '\n') {
Victor Stinner358af132015-10-12 22:36:57 +02006232 p = _PyBytesWriter_Prepare(&writer, p, 2-1);
6233 if (p == NULL)
6234 goto error;
6235
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006236 *p++ = '\\';
6237 *p++ = 'n';
6238 }
6239 else if (ch == '\r') {
Victor Stinner358af132015-10-12 22:36:57 +02006240 p = _PyBytesWriter_Prepare(&writer, p, 2-1);
6241 if (p == NULL)
6242 goto error;
6243
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006244 *p++ = '\\';
6245 *p++ = 'r';
6246 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006247
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006248 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00006249 else if (ch < ' ' || ch >= 0x7F) {
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006250 /* -1: subtract 1 preallocated byte */
Victor Stinner358af132015-10-12 22:36:57 +02006251 p = _PyBytesWriter_Prepare(&writer, p, 4-1);
6252 if (p == NULL)
6253 goto error;
6254
Guido van Rossumd57fd912000-03-10 22:53:23 +00006255 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006256 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006257 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6258 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00006259 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006260
Guido van Rossumd57fd912000-03-10 22:53:23 +00006261 /* Copy everything else as-is */
6262 else
6263 *p++ = (char) ch;
6264 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006265
Victor Stinner358af132015-10-12 22:36:57 +02006266 return _PyBytesWriter_Finish(&writer, p);
6267
6268error:
6269 _PyBytesWriter_Dealloc(&writer);
6270 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006271}
6272
Alexander Belopolsky40018472011-02-26 01:02:56 +00006273PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006274PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6275 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006276{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006277 PyObject *result;
6278 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6279 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006280 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006281 result = PyUnicode_AsUnicodeEscapeString(tmp);
6282 Py_DECREF(tmp);
6283 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006284}
6285
6286/* --- Raw Unicode Escape Codec ------------------------------------------- */
6287
Alexander Belopolsky40018472011-02-26 01:02:56 +00006288PyObject *
6289PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006290 Py_ssize_t size,
6291 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006292{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006293 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006294 Py_ssize_t startinpos;
6295 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006296 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006297 const char *end;
6298 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006299 PyObject *errorHandler = NULL;
6300 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006301
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006302 if (size == 0)
6303 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006304
Guido van Rossumd57fd912000-03-10 22:53:23 +00006305 /* Escaped strings will always be longer than the resulting
6306 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006307 length after conversion to the true value. (But decoding error
6308 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006309 _PyUnicodeWriter_Init(&writer);
6310 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006311
Guido van Rossumd57fd912000-03-10 22:53:23 +00006312 end = s + size;
6313 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006314 unsigned char c;
6315 Py_UCS4 x;
6316 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006317 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006318
Benjamin Peterson29060642009-01-31 22:14:21 +00006319 /* Non-escape characters are interpreted as Unicode ordinals */
6320 if (*s != '\\') {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006321 x = (unsigned char)*s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006322 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006323 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006324 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006325 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006326 startinpos = s-starts;
6327
6328 /* \u-escapes are only interpreted iff the number of leading
6329 backslashes if odd */
6330 bs = s;
6331 for (;s < end;) {
6332 if (*s != '\\')
6333 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006334 x = (unsigned char)*s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006335 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006336 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006337 }
6338 if (((s - bs) & 1) == 0 ||
6339 s >= end ||
6340 (*s != 'u' && *s != 'U')) {
6341 continue;
6342 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006343 writer.pos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00006344 count = *s=='u' ? 4 : 8;
6345 s++;
6346
6347 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00006348 for (x = 0, i = 0; i < count; ++i, ++s) {
6349 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00006350 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006351 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006352 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00006353 errors, &errorHandler,
6354 "rawunicodeescape", "truncated \\uXXXX",
6355 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006356 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006357 goto onError;
6358 goto nextByte;
6359 }
6360 x = (x<<4) & ~0xF;
6361 if (c >= '0' && c <= '9')
6362 x += c - '0';
6363 else if (c >= 'a' && c <= 'f')
6364 x += 10 + c - 'a';
6365 else
6366 x += 10 + c - 'A';
6367 }
Victor Stinner8faf8212011-12-08 22:14:11 +01006368 if (x <= MAX_UNICODE) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006369 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006370 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006371 }
6372 else {
Christian Heimesfe337bf2008-03-23 21:54:12 +00006373 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006374 if (unicode_decode_call_errorhandler_writer(
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006375 errors, &errorHandler,
6376 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006377 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006378 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006379 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006380 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006381 nextByte:
6382 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006383 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006384 Py_XDECREF(errorHandler);
6385 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006386 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006387
Benjamin Peterson29060642009-01-31 22:14:21 +00006388 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006389 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006390 Py_XDECREF(errorHandler);
6391 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006392 return NULL;
6393}
6394
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006395
Alexander Belopolsky40018472011-02-26 01:02:56 +00006396PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006397PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006398{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006399 char *p;
Victor Stinner358af132015-10-12 22:36:57 +02006400 Py_ssize_t pos;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006401 int kind;
6402 void *data;
6403 Py_ssize_t len;
Victor Stinner358af132015-10-12 22:36:57 +02006404 _PyBytesWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006405
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006406 if (!PyUnicode_Check(unicode)) {
6407 PyErr_BadArgument();
6408 return NULL;
6409 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006410 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006411 return NULL;
Victor Stinner358af132015-10-12 22:36:57 +02006412
6413 _PyBytesWriter_Init(&writer);
6414
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006415 kind = PyUnicode_KIND(unicode);
6416 data = PyUnicode_DATA(unicode);
6417 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner0e368262011-11-10 20:12:49 +01006418
Victor Stinner358af132015-10-12 22:36:57 +02006419 p = _PyBytesWriter_Alloc(&writer, len);
6420 if (p == NULL)
6421 goto error;
6422 writer.overallocate = 1;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006423
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006424 for (pos = 0; pos < len; pos++) {
6425 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006426 /* Map 32-bit characters to '\Uxxxxxxxx' */
6427 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006428 assert(ch <= MAX_UNICODE);
Victor Stinner358af132015-10-12 22:36:57 +02006429
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006430 /* -1: subtract 1 preallocated byte */
Victor Stinner358af132015-10-12 22:36:57 +02006431 p = _PyBytesWriter_Prepare(&writer, p, 10-1);
6432 if (p == NULL)
6433 goto error;
6434
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006435 *p++ = '\\';
6436 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006437 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6438 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6439 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6440 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6441 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6442 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6443 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6444 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006445 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006446 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006447 else if (ch >= 256) {
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006448 /* -1: subtract 1 preallocated byte */
Victor Stinner358af132015-10-12 22:36:57 +02006449 p = _PyBytesWriter_Prepare(&writer, p, 6-1);
6450 if (p == NULL)
6451 goto error;
6452
Guido van Rossumd57fd912000-03-10 22:53:23 +00006453 *p++ = '\\';
6454 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006455 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6456 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6457 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6458 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006459 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006460 /* Copy everything else as-is */
6461 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006462 *p++ = (char) ch;
6463 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006464
Victor Stinner358af132015-10-12 22:36:57 +02006465 return _PyBytesWriter_Finish(&writer, p);
6466
6467error:
6468 _PyBytesWriter_Dealloc(&writer);
6469 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006470}
6471
Alexander Belopolsky40018472011-02-26 01:02:56 +00006472PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006473PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6474 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006475{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006476 PyObject *result;
6477 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6478 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006479 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006480 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6481 Py_DECREF(tmp);
6482 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006483}
6484
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006485/* --- Unicode Internal Codec ------------------------------------------- */
6486
Alexander Belopolsky40018472011-02-26 01:02:56 +00006487PyObject *
6488_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006489 Py_ssize_t size,
6490 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006491{
6492 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006493 Py_ssize_t startinpos;
6494 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006495 _PyUnicodeWriter writer;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006496 const char *end;
6497 const char *reason;
6498 PyObject *errorHandler = NULL;
6499 PyObject *exc = NULL;
6500
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006501 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006502 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006503 1))
6504 return NULL;
6505
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006506 if (size == 0)
6507 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006508
Victor Stinner8f674cc2013-04-17 23:02:17 +02006509 _PyUnicodeWriter_Init(&writer);
6510 if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) {
6511 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00006512 goto onError;
Victor Stinner8f674cc2013-04-17 23:02:17 +02006513 }
6514 writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006515
Victor Stinner8f674cc2013-04-17 23:02:17 +02006516 end = s + size;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006517 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006518 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006519 Py_UCS4 ch;
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006520 if (end - s < Py_UNICODE_SIZE) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006521 endinpos = end-starts;
6522 reason = "truncated input";
6523 goto error;
6524 }
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006525 /* We copy the raw representation one byte at a time because the
6526 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006527 ((char *) &uch)[0] = s[0];
6528 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006529#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006530 ((char *) &uch)[2] = s[2];
6531 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006532#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006533 ch = uch;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006534#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006535 /* We have to sanity check the raw data, otherwise doom looms for
6536 some malformed UCS-4 data. */
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006537 if (ch > 0x10ffff) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006538 endinpos = s - starts + Py_UNICODE_SIZE;
6539 reason = "illegal code point (> 0x10FFFF)";
6540 goto error;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006541 }
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006542#endif
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006543 s += Py_UNICODE_SIZE;
6544#ifndef Py_UNICODE_WIDE
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006545 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006546 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006547 Py_UNICODE uch2;
6548 ((char *) &uch2)[0] = s[0];
6549 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006550 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006551 {
Victor Stinner551ac952011-11-29 22:58:13 +01006552 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006553 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006554 }
6555 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006556#endif
6557
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006558 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006559 goto onError;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006560 continue;
6561
6562 error:
6563 startinpos = s - starts;
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006564 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006565 errors, &errorHandler,
6566 "unicode_internal", reason,
6567 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006568 &writer))
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006569 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006570 }
6571
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006572 Py_XDECREF(errorHandler);
6573 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006574 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006575
Benjamin Peterson29060642009-01-31 22:14:21 +00006576 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006577 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006578 Py_XDECREF(errorHandler);
6579 Py_XDECREF(exc);
6580 return NULL;
6581}
6582
Guido van Rossumd57fd912000-03-10 22:53:23 +00006583/* --- Latin-1 Codec ------------------------------------------------------ */
6584
Alexander Belopolsky40018472011-02-26 01:02:56 +00006585PyObject *
6586PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006587 Py_ssize_t size,
6588 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006589{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006590 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006591 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006592}
6593
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006594/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006595static void
6596make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006597 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006598 PyObject *unicode,
6599 Py_ssize_t startpos, Py_ssize_t endpos,
6600 const char *reason)
6601{
6602 if (*exceptionObject == NULL) {
6603 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006604 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006605 encoding, unicode, startpos, endpos, reason);
6606 }
6607 else {
6608 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6609 goto onError;
6610 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6611 goto onError;
6612 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6613 goto onError;
6614 return;
6615 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006616 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006617 }
6618}
6619
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006620/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006621static void
6622raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006623 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006624 PyObject *unicode,
6625 Py_ssize_t startpos, Py_ssize_t endpos,
6626 const char *reason)
6627{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006628 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006629 encoding, unicode, startpos, endpos, reason);
6630 if (*exceptionObject != NULL)
6631 PyCodec_StrictErrors(*exceptionObject);
6632}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006633
6634/* error handling callback helper:
6635 build arguments, call the callback and check the arguments,
6636 put the result into newpos and return the replacement string, which
6637 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006638static PyObject *
6639unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006640 PyObject **errorHandler,
6641 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006642 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006643 Py_ssize_t startpos, Py_ssize_t endpos,
6644 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006645{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02006646 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006647 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006648 PyObject *restuple;
6649 PyObject *resunicode;
6650
6651 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006652 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006653 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006654 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006655 }
6656
Benjamin Petersonbac79492012-01-14 13:34:47 -05006657 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006658 return NULL;
6659 len = PyUnicode_GET_LENGTH(unicode);
6660
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006661 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006662 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006663 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006664 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006665
6666 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006667 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006668 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006669 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006670 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006671 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006672 Py_DECREF(restuple);
6673 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006674 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006675 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006676 &resunicode, newpos)) {
6677 Py_DECREF(restuple);
6678 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006679 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006680 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6681 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6682 Py_DECREF(restuple);
6683 return NULL;
6684 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006685 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006686 *newpos = len + *newpos;
6687 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02006688 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006689 Py_DECREF(restuple);
6690 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006691 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006692 Py_INCREF(resunicode);
6693 Py_DECREF(restuple);
6694 return resunicode;
6695}
6696
Alexander Belopolsky40018472011-02-26 01:02:56 +00006697static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006698unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006699 const char *errors,
Victor Stinner0030cd52015-09-24 14:45:00 +02006700 const Py_UCS4 limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006701{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006702 /* input state */
6703 Py_ssize_t pos=0, size;
6704 int kind;
6705 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006706 /* pointer into the output */
6707 char *str;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006708 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6709 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Victor Stinner50149202015-09-22 00:26:54 +02006710 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006711 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02006712 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006713 PyObject *rep = NULL;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006714 /* output object */
6715 _PyBytesWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006716
Benjamin Petersonbac79492012-01-14 13:34:47 -05006717 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006718 return NULL;
6719 size = PyUnicode_GET_LENGTH(unicode);
6720 kind = PyUnicode_KIND(unicode);
6721 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006722 /* allocate enough for a simple encoding without
6723 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006724 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006725 return PyBytes_FromStringAndSize(NULL, 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006726
6727 _PyBytesWriter_Init(&writer);
6728 str = _PyBytesWriter_Alloc(&writer, size);
6729 if (str == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006730 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006731
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006732 while (pos < size) {
Victor Stinner0030cd52015-09-24 14:45:00 +02006733 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006734
Benjamin Peterson29060642009-01-31 22:14:21 +00006735 /* can we encode this? */
Victor Stinner0030cd52015-09-24 14:45:00 +02006736 if (ch < limit) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006737 /* no overflow check, because we know that the space is enough */
Victor Stinner0030cd52015-09-24 14:45:00 +02006738 *str++ = (char)ch;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006739 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006740 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006741 else {
Victor Stinner6bd525b2015-10-09 13:10:05 +02006742 Py_ssize_t newpos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006743 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006744 Py_ssize_t collstart = pos;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006745 Py_ssize_t collend = collstart + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006746 /* find all unecodable characters */
Victor Stinner50149202015-09-22 00:26:54 +02006747
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006748 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006749 ++collend;
Victor Stinner50149202015-09-22 00:26:54 +02006750
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006751 /* Only overallocate the buffer if it's not the last write */
6752 writer.overallocate = (collend < size);
6753
Benjamin Peterson29060642009-01-31 22:14:21 +00006754 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02006755 if (error_handler == _Py_ERROR_UNKNOWN)
6756 error_handler = get_error_handler(errors);
6757
6758 switch (error_handler) {
6759 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006760 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006761 goto onError;
Victor Stinner50149202015-09-22 00:26:54 +02006762
6763 case _Py_ERROR_REPLACE:
Victor Stinner01ada392015-10-01 21:54:51 +02006764 memset(str, '?', collend - collstart);
6765 str += (collend - collstart);
Victor Stinner0030cd52015-09-24 14:45:00 +02006766 /* fall through ignore error handler */
Victor Stinner50149202015-09-22 00:26:54 +02006767 case _Py_ERROR_IGNORE:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006768 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006769 break;
Victor Stinner50149202015-09-22 00:26:54 +02006770
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006771 case _Py_ERROR_BACKSLASHREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006772 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006773 writer.min_size -= (collend - collstart);
6774 str = backslashreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006775 unicode, collstart, collend);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006776 if (str == NULL)
6777 goto onError;
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006778 pos = collend;
6779 break;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006780
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006781 case _Py_ERROR_XMLCHARREFREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006782 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006783 writer.min_size -= (collend - collstart);
6784 str = xmlcharrefreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006785 unicode, collstart, collend);
6786 if (str == NULL)
6787 goto onError;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006788 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006789 break;
Victor Stinner50149202015-09-22 00:26:54 +02006790
Victor Stinnerc3713e92015-09-29 12:32:13 +02006791 case _Py_ERROR_SURROGATEESCAPE:
6792 for (i = collstart; i < collend; ++i) {
6793 ch = PyUnicode_READ(kind, data, i);
6794 if (ch < 0xdc80 || 0xdcff < ch) {
6795 /* Not a UTF-8b surrogate */
6796 break;
6797 }
6798 *str++ = (char)(ch - 0xdc00);
6799 ++pos;
6800 }
6801 if (i >= collend)
6802 break;
6803 collstart = pos;
6804 assert(collstart != collend);
6805 /* fallback to general error handling */
6806
Benjamin Peterson29060642009-01-31 22:14:21 +00006807 default:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006808 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
6809 encoding, reason, unicode, &exc,
6810 collstart, collend, &newpos);
6811 if (rep == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006812 goto onError;
Victor Stinner0030cd52015-09-24 14:45:00 +02006813
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006814 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006815 writer.min_size -= 1;
6816
Victor Stinner6bd525b2015-10-09 13:10:05 +02006817 if (PyBytes_Check(rep)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006818 /* Directly copy bytes result to output. */
Victor Stinnerce179bf2015-10-09 12:57:22 +02006819 str = _PyBytesWriter_WriteBytes(&writer, str,
Victor Stinner6bd525b2015-10-09 13:10:05 +02006820 PyBytes_AS_STRING(rep),
6821 PyBytes_GET_SIZE(rep));
Victor Stinnerad771582015-10-09 12:38:53 +02006822 if (str == NULL)
6823 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006824 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02006825 else {
6826 assert(PyUnicode_Check(rep));
Victor Stinner0030cd52015-09-24 14:45:00 +02006827
Victor Stinner6bd525b2015-10-09 13:10:05 +02006828 if (PyUnicode_READY(rep) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006829 goto onError;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006830
6831 if (PyUnicode_IS_ASCII(rep)) {
6832 /* Fast path: all characters are smaller than limit */
6833 assert(limit >= 128);
6834 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6835 str = _PyBytesWriter_WriteBytes(&writer, str,
6836 PyUnicode_DATA(rep),
6837 PyUnicode_GET_LENGTH(rep));
Benjamin Peterson29060642009-01-31 22:14:21 +00006838 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02006839 else {
6840 Py_ssize_t repsize = PyUnicode_GET_LENGTH(rep);
6841
6842 str = _PyBytesWriter_Prepare(&writer, str, repsize);
6843 if (str == NULL)
6844 goto onError;
6845
6846 /* check if there is anything unencodable in the
6847 replacement and copy it to the output */
6848 for (i = 0; repsize-->0; ++i, ++str) {
6849 ch = PyUnicode_READ_CHAR(rep, i);
6850 if (ch >= limit) {
6851 raise_encode_exception(&exc, encoding, unicode,
6852 pos, pos+1, reason);
6853 goto onError;
6854 }
6855 *str = (char)ch;
6856 }
6857 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006858 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006859 pos = newpos;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006860 Py_CLEAR(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006861 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006862
6863 /* If overallocation was disabled, ensure that it was the last
6864 write. Otherwise, we missed an optimization */
6865 assert(writer.overallocate || pos == size);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006866 }
6867 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006868
Victor Stinner50149202015-09-22 00:26:54 +02006869 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006870 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006871 return _PyBytesWriter_Finish(&writer, str);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006872
6873 onError:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006874 Py_XDECREF(rep);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006875 _PyBytesWriter_Dealloc(&writer);
Victor Stinner50149202015-09-22 00:26:54 +02006876 Py_XDECREF(error_handler_obj);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006877 Py_XDECREF(exc);
6878 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006879}
6880
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006881/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006882PyObject *
6883PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006884 Py_ssize_t size,
6885 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006886{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006887 PyObject *result;
6888 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6889 if (unicode == NULL)
6890 return NULL;
6891 result = unicode_encode_ucs1(unicode, errors, 256);
6892 Py_DECREF(unicode);
6893 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006894}
6895
Alexander Belopolsky40018472011-02-26 01:02:56 +00006896PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006897_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006898{
6899 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006900 PyErr_BadArgument();
6901 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006902 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006903 if (PyUnicode_READY(unicode) == -1)
6904 return NULL;
6905 /* Fast path: if it is a one-byte string, construct
6906 bytes object directly. */
6907 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6908 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6909 PyUnicode_GET_LENGTH(unicode));
6910 /* Non-Latin-1 characters present. Defer to above function to
6911 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006912 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006913}
6914
6915PyObject*
6916PyUnicode_AsLatin1String(PyObject *unicode)
6917{
6918 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006919}
6920
6921/* --- 7-bit ASCII Codec -------------------------------------------------- */
6922
Alexander Belopolsky40018472011-02-26 01:02:56 +00006923PyObject *
6924PyUnicode_DecodeASCII(const char *s,
6925 Py_ssize_t size,
6926 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006927{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006928 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006929 _PyUnicodeWriter writer;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006930 int kind;
6931 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006932 Py_ssize_t startinpos;
6933 Py_ssize_t endinpos;
6934 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006935 const char *e;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006936 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006937 PyObject *exc = NULL;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006938 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Tim Petersced69f82003-09-16 20:30:58 +00006939
Guido van Rossumd57fd912000-03-10 22:53:23 +00006940 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02006941 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006942
Guido van Rossumd57fd912000-03-10 22:53:23 +00006943 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006944 if (size == 1 && (unsigned char)s[0] < 128)
6945 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006946
Victor Stinner8f674cc2013-04-17 23:02:17 +02006947 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02006948 writer.min_length = size;
6949 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
Victor Stinner8f674cc2013-04-17 23:02:17 +02006950 return NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006951
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006952 e = s + size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006953 data = writer.data;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006954 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006955 writer.pos = outpos;
6956 if (writer.pos == size)
6957 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006958
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006959 s += writer.pos;
6960 kind = writer.kind;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006961 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02006962 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00006963 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006964 PyUnicode_WRITE(kind, data, writer.pos, c);
6965 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00006966 ++s;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006967 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +00006968 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02006969
6970 /* byte outsize range 0x00..0x7f: call the error handler */
6971
6972 if (error_handler == _Py_ERROR_UNKNOWN)
6973 error_handler = get_error_handler(errors);
6974
6975 switch (error_handler)
6976 {
6977 case _Py_ERROR_REPLACE:
6978 case _Py_ERROR_SURROGATEESCAPE:
6979 /* Fast-path: the error handler only writes one character,
Victor Stinnerca9381e2015-09-22 00:58:32 +02006980 but we may switch to UCS2 at the first write */
6981 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
6982 goto onError;
6983 kind = writer.kind;
6984 data = writer.data;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006985
6986 if (error_handler == _Py_ERROR_REPLACE)
6987 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
6988 else
6989 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
6990 writer.pos++;
6991 ++s;
6992 break;
6993
6994 case _Py_ERROR_IGNORE:
6995 ++s;
6996 break;
6997
6998 default:
Benjamin Peterson29060642009-01-31 22:14:21 +00006999 startinpos = s-starts;
7000 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007001 if (unicode_decode_call_errorhandler_writer(
Victor Stinnerf96418d2015-09-21 23:06:27 +02007002 errors, &error_handler_obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007003 "ascii", "ordinal not in range(128)",
7004 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007005 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00007006 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007007 kind = writer.kind;
7008 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00007009 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007010 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007011 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007012 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007013 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007014
Benjamin Peterson29060642009-01-31 22:14:21 +00007015 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007016 _PyUnicodeWriter_Dealloc(&writer);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007017 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007018 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007019 return NULL;
7020}
7021
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007022/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007023PyObject *
7024PyUnicode_EncodeASCII(const Py_UNICODE *p,
7025 Py_ssize_t size,
7026 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007027{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007028 PyObject *result;
7029 PyObject *unicode = PyUnicode_FromUnicode(p, size);
7030 if (unicode == NULL)
7031 return NULL;
7032 result = unicode_encode_ucs1(unicode, errors, 128);
7033 Py_DECREF(unicode);
7034 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007035}
7036
Alexander Belopolsky40018472011-02-26 01:02:56 +00007037PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007038_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007039{
7040 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007041 PyErr_BadArgument();
7042 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007043 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007044 if (PyUnicode_READY(unicode) == -1)
7045 return NULL;
7046 /* Fast path: if it is an ASCII-only string, construct bytes object
7047 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02007048 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007049 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7050 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007051 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007052}
7053
7054PyObject *
7055PyUnicode_AsASCIIString(PyObject *unicode)
7056{
7057 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007058}
7059
Victor Stinner99b95382011-07-04 14:23:54 +02007060#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007061
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007062/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007063
/* When int is narrower than size_t, decode_code_page_stateful() feeds
   inputs longer than INT_MAX to the converter in INT_MAX-sized chunks. */
#if SIZEOF_INT < SIZEOF_SIZE_T
#  define NEED_RETRY
#endif

/* Fallback definition for headers that do not provide this flag. */
#if !defined(WC_ERR_INVALID_CHARS)
#  define WC_ERR_INVALID_CHARS 0x0080
#endif
7071
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007072static const char*
Victor Stinner3a50e702011-10-18 21:21:00 +02007073code_page_name(UINT code_page, PyObject **obj)
7074{
7075 *obj = NULL;
7076 if (code_page == CP_ACP)
7077 return "mbcs";
7078 if (code_page == CP_UTF7)
7079 return "CP_UTF7";
7080 if (code_page == CP_UTF8)
7081 return "CP_UTF8";
7082
7083 *obj = PyBytes_FromFormat("cp%u", code_page);
7084 if (*obj == NULL)
7085 return NULL;
7086 return PyBytes_AS_STRING(*obj);
7087}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007088
Victor Stinner3a50e702011-10-18 21:21:00 +02007089static DWORD
7090decode_code_page_flags(UINT code_page)
7091{
7092 if (code_page == CP_UTF7) {
7093 /* The CP_UTF7 decoder only supports flags=0 */
7094 return 0;
7095 }
7096 else
7097 return MB_ERR_INVALID_CHARS;
7098}
7099
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007100/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007101 * Decode a byte string from a Windows code page into unicode object in strict
7102 * mode.
7103 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007104 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7105 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007106 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007107static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007108decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007109 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02007110 const char *in,
7111 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007112{
Victor Stinner3a50e702011-10-18 21:21:00 +02007113 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01007114 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007115 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007116
7117 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007118 assert(insize > 0);
7119 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
7120 if (outsize <= 0)
7121 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007122
7123 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007124 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01007125 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007126 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00007127 if (*v == NULL)
7128 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007129 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007130 }
7131 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007132 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007133 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01007134 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007135 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007136 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007137 }
7138
7139 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007140 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7141 if (outsize <= 0)
7142 goto error;
7143 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007144
Victor Stinner3a50e702011-10-18 21:21:00 +02007145error:
7146 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7147 return -2;
7148 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007149 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007150}
7151
Victor Stinner3a50e702011-10-18 21:21:00 +02007152/*
7153 * Decode a byte string from a code page into unicode object with an error
7154 * handler.
7155 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007156 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02007157 * UnicodeDecodeError exception and returns -1 on error.
7158 */
7159static int
7160decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007161 PyObject **v,
7162 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007163 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02007164{
7165 const char *startin = in;
7166 const char *endin = in + size;
7167 const DWORD flags = decode_code_page_flags(code_page);
7168 /* Ideally, we should get reason from FormatMessage. This is the Windows
7169 2000 English version of the message. */
7170 const char *reason = "No mapping for the Unicode character exists "
7171 "in the target code page.";
7172 /* each step cannot decode more than 1 character, but a character can be
7173 represented as a surrogate pair */
7174 wchar_t buffer[2], *startout, *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02007175 int insize;
7176 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007177 PyObject *errorHandler = NULL;
7178 PyObject *exc = NULL;
7179 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007180 const char *encoding;
Victor Stinner3a50e702011-10-18 21:21:00 +02007181 DWORD err;
7182 int ret = -1;
7183
7184 assert(size > 0);
7185
7186 encoding = code_page_name(code_page, &encoding_obj);
7187 if (encoding == NULL)
7188 return -1;
7189
Victor Stinner7d00cc12014-03-17 23:08:06 +01007190 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007191 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7192 UnicodeDecodeError. */
7193 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7194 if (exc != NULL) {
7195 PyCodec_StrictErrors(exc);
7196 Py_CLEAR(exc);
7197 }
7198 goto error;
7199 }
7200
7201 if (*v == NULL) {
7202 /* Create unicode object */
7203 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7204 PyErr_NoMemory();
7205 goto error;
7206 }
Victor Stinnerab595942011-12-17 04:59:06 +01007207 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007208 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02007209 if (*v == NULL)
7210 goto error;
7211 startout = PyUnicode_AS_UNICODE(*v);
7212 }
7213 else {
7214 /* Extend unicode object */
7215 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7216 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7217 PyErr_NoMemory();
7218 goto error;
7219 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007220 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007221 goto error;
7222 startout = PyUnicode_AS_UNICODE(*v) + n;
7223 }
7224
7225 /* Decode the byte string character per character */
7226 out = startout;
7227 while (in < endin)
7228 {
7229 /* Decode a character */
7230 insize = 1;
7231 do
7232 {
7233 outsize = MultiByteToWideChar(code_page, flags,
7234 in, insize,
7235 buffer, Py_ARRAY_LENGTH(buffer));
7236 if (outsize > 0)
7237 break;
7238 err = GetLastError();
7239 if (err != ERROR_NO_UNICODE_TRANSLATION
7240 && err != ERROR_INSUFFICIENT_BUFFER)
7241 {
7242 PyErr_SetFromWindowsErr(0);
7243 goto error;
7244 }
7245 insize++;
7246 }
7247 /* 4=maximum length of a UTF-8 sequence */
7248 while (insize <= 4 && (in + insize) <= endin);
7249
7250 if (outsize <= 0) {
7251 Py_ssize_t startinpos, endinpos, outpos;
7252
Victor Stinner7d00cc12014-03-17 23:08:06 +01007253 /* last character in partial decode? */
7254 if (in + insize >= endin && !final)
7255 break;
7256
Victor Stinner3a50e702011-10-18 21:21:00 +02007257 startinpos = in - startin;
7258 endinpos = startinpos + 1;
7259 outpos = out - PyUnicode_AS_UNICODE(*v);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007260 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007261 errors, &errorHandler,
7262 encoding, reason,
7263 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01007264 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007265 {
7266 goto error;
7267 }
Victor Stinner596a6c42011-11-09 00:02:18 +01007268 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007269 }
7270 else {
7271 in += insize;
7272 memcpy(out, buffer, outsize * sizeof(wchar_t));
7273 out += outsize;
7274 }
7275 }
7276
7277 /* write a NUL character at the end */
7278 *out = 0;
7279
7280 /* Extend unicode object */
7281 outsize = out - startout;
7282 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01007283 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007284 goto error;
Victor Stinnere1f17c62014-07-25 14:03:03 +02007285 /* (in - startin) <= size and size is an int */
7286 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02007287
7288error:
7289 Py_XDECREF(encoding_obj);
7290 Py_XDECREF(errorHandler);
7291 Py_XDECREF(exc);
7292 return ret;
7293}
7294
Victor Stinner3a50e702011-10-18 21:21:00 +02007295static PyObject *
7296decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007297 const char *s, Py_ssize_t size,
7298 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007299{
Victor Stinner76a31a62011-11-04 00:05:13 +01007300 PyObject *v = NULL;
7301 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007302
Victor Stinner3a50e702011-10-18 21:21:00 +02007303 if (code_page < 0) {
7304 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7305 return NULL;
7306 }
7307
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007308 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007309 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007310
Victor Stinner76a31a62011-11-04 00:05:13 +01007311 do
7312 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007313#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007314 if (size > INT_MAX) {
7315 chunk_size = INT_MAX;
7316 final = 0;
7317 done = 0;
7318 }
7319 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007320#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007321 {
7322 chunk_size = (int)size;
7323 final = (consumed == NULL);
7324 done = 1;
7325 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007326
Victor Stinner76a31a62011-11-04 00:05:13 +01007327 if (chunk_size == 0 && done) {
7328 if (v != NULL)
7329 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007330 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007331 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007332
Victor Stinner76a31a62011-11-04 00:05:13 +01007333 converted = decode_code_page_strict(code_page, &v,
7334 s, chunk_size);
7335 if (converted == -2)
7336 converted = decode_code_page_errors(code_page, &v,
7337 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007338 errors, final);
7339 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007340
7341 if (converted < 0) {
7342 Py_XDECREF(v);
7343 return NULL;
7344 }
7345
7346 if (consumed)
7347 *consumed += converted;
7348
7349 s += converted;
7350 size -= converted;
7351 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007352
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007353 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007354}
7355
Alexander Belopolsky40018472011-02-26 01:02:56 +00007356PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007357PyUnicode_DecodeCodePageStateful(int code_page,
7358 const char *s,
7359 Py_ssize_t size,
7360 const char *errors,
7361 Py_ssize_t *consumed)
7362{
7363 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7364}
7365
7366PyObject *
7367PyUnicode_DecodeMBCSStateful(const char *s,
7368 Py_ssize_t size,
7369 const char *errors,
7370 Py_ssize_t *consumed)
7371{
7372 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7373}
7374
7375PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007376PyUnicode_DecodeMBCS(const char *s,
7377 Py_ssize_t size,
7378 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007379{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007380 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7381}
7382
Victor Stinner3a50e702011-10-18 21:21:00 +02007383static DWORD
7384encode_code_page_flags(UINT code_page, const char *errors)
7385{
7386 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007387 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007388 }
7389 else if (code_page == CP_UTF7) {
7390 /* CP_UTF7 only supports flags=0 */
7391 return 0;
7392 }
7393 else {
7394 if (errors != NULL && strcmp(errors, "replace") == 0)
7395 return 0;
7396 else
7397 return WC_NO_BEST_FIT_CHARS;
7398 }
7399}
7400
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007401/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007402 * Encode a Unicode string to a Windows code page into a byte string in strict
7403 * mode.
7404 *
7405 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007406 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007407 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007408static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007409encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007410 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007411 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007412{
Victor Stinner554f3f02010-06-16 23:33:54 +00007413 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007414 BOOL *pusedDefaultChar = &usedDefaultChar;
7415 int outsize;
Victor Stinner24729f32011-11-10 20:31:37 +01007416 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007417 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007418 const DWORD flags = encode_code_page_flags(code_page, NULL);
7419 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007420 /* Create a substring so that we can get the UTF-16 representation
7421 of just the slice under consideration. */
7422 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007423
Martin v. Löwis3d325192011-11-04 18:23:06 +01007424 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007425
Victor Stinner3a50e702011-10-18 21:21:00 +02007426 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007427 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007428 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007429 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007430
Victor Stinner2fc507f2011-11-04 20:06:39 +01007431 substring = PyUnicode_Substring(unicode, offset, offset+len);
7432 if (substring == NULL)
7433 return -1;
7434 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7435 if (p == NULL) {
7436 Py_DECREF(substring);
7437 return -1;
7438 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007439 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007440
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007441 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007442 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007443 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007444 NULL, 0,
7445 NULL, pusedDefaultChar);
7446 if (outsize <= 0)
7447 goto error;
7448 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007449 if (pusedDefaultChar && *pusedDefaultChar) {
7450 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007451 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007452 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007453
Victor Stinner3a50e702011-10-18 21:21:00 +02007454 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007455 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007456 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007457 if (*outbytes == NULL) {
7458 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007459 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007460 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007461 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007462 }
7463 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007464 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007465 const Py_ssize_t n = PyBytes_Size(*outbytes);
7466 if (outsize > PY_SSIZE_T_MAX - n) {
7467 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007468 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007469 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007470 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007471 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7472 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007473 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007474 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007475 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007476 }
7477
7478 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007479 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007480 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007481 out, outsize,
7482 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007483 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007484 if (outsize <= 0)
7485 goto error;
7486 if (pusedDefaultChar && *pusedDefaultChar)
7487 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007488 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007489
Victor Stinner3a50e702011-10-18 21:21:00 +02007490error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007491 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007492 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7493 return -2;
7494 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007495 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007496}
7497
Victor Stinner3a50e702011-10-18 21:21:00 +02007498/*
Serhiy Storchakad65c9492015-11-02 14:10:23 +02007499 * Encode a Unicode string to a Windows code page into a byte string using an
Victor Stinner3a50e702011-10-18 21:21:00 +02007500 * error handler.
7501 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007502 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007503 * -1 on other error.
7504 */
7505static int
7506encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007507 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007508 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007509{
Victor Stinner3a50e702011-10-18 21:21:00 +02007510 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007511 Py_ssize_t pos = unicode_offset;
7512 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007513 /* Ideally, we should get reason from FormatMessage. This is the Windows
7514 2000 English version of the message. */
7515 const char *reason = "invalid character";
7516 /* 4=maximum length of a UTF-8 sequence */
7517 char buffer[4];
7518 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7519 Py_ssize_t outsize;
7520 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007521 PyObject *errorHandler = NULL;
7522 PyObject *exc = NULL;
7523 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007524 const char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007525 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007526 PyObject *rep;
7527 int ret = -1;
7528
7529 assert(insize > 0);
7530
7531 encoding = code_page_name(code_page, &encoding_obj);
7532 if (encoding == NULL)
7533 return -1;
7534
7535 if (errors == NULL || strcmp(errors, "strict") == 0) {
7536 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7537 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007538 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007539 if (exc != NULL) {
7540 PyCodec_StrictErrors(exc);
7541 Py_DECREF(exc);
7542 }
7543 Py_XDECREF(encoding_obj);
7544 return -1;
7545 }
7546
7547 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7548 pusedDefaultChar = &usedDefaultChar;
7549 else
7550 pusedDefaultChar = NULL;
7551
7552 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7553 PyErr_NoMemory();
7554 goto error;
7555 }
7556 outsize = insize * Py_ARRAY_LENGTH(buffer);
7557
7558 if (*outbytes == NULL) {
7559 /* Create string object */
7560 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7561 if (*outbytes == NULL)
7562 goto error;
7563 out = PyBytes_AS_STRING(*outbytes);
7564 }
7565 else {
7566 /* Extend string object */
7567 Py_ssize_t n = PyBytes_Size(*outbytes);
7568 if (n > PY_SSIZE_T_MAX - outsize) {
7569 PyErr_NoMemory();
7570 goto error;
7571 }
7572 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7573 goto error;
7574 out = PyBytes_AS_STRING(*outbytes) + n;
7575 }
7576
7577 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007578 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007579 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007580 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7581 wchar_t chars[2];
7582 int charsize;
7583 if (ch < 0x10000) {
7584 chars[0] = (wchar_t)ch;
7585 charsize = 1;
7586 }
7587 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007588 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7589 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007590 charsize = 2;
7591 }
7592
Victor Stinner3a50e702011-10-18 21:21:00 +02007593 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007594 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007595 buffer, Py_ARRAY_LENGTH(buffer),
7596 NULL, pusedDefaultChar);
7597 if (outsize > 0) {
7598 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7599 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007600 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007601 memcpy(out, buffer, outsize);
7602 out += outsize;
7603 continue;
7604 }
7605 }
7606 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7607 PyErr_SetFromWindowsErr(0);
7608 goto error;
7609 }
7610
Victor Stinner3a50e702011-10-18 21:21:00 +02007611 rep = unicode_encode_call_errorhandler(
7612 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007613 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007614 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007615 if (rep == NULL)
7616 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007617 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007618
7619 if (PyBytes_Check(rep)) {
7620 outsize = PyBytes_GET_SIZE(rep);
7621 if (outsize != 1) {
7622 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7623 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7624 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7625 Py_DECREF(rep);
7626 goto error;
7627 }
7628 out = PyBytes_AS_STRING(*outbytes) + offset;
7629 }
7630 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7631 out += outsize;
7632 }
7633 else {
7634 Py_ssize_t i;
7635 enum PyUnicode_Kind kind;
7636 void *data;
7637
Benjamin Petersonbac79492012-01-14 13:34:47 -05007638 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007639 Py_DECREF(rep);
7640 goto error;
7641 }
7642
7643 outsize = PyUnicode_GET_LENGTH(rep);
7644 if (outsize != 1) {
7645 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7646 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7647 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7648 Py_DECREF(rep);
7649 goto error;
7650 }
7651 out = PyBytes_AS_STRING(*outbytes) + offset;
7652 }
7653 kind = PyUnicode_KIND(rep);
7654 data = PyUnicode_DATA(rep);
7655 for (i=0; i < outsize; i++) {
7656 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7657 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007658 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007659 encoding, unicode,
7660 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007661 "unable to encode error handler result to ASCII");
7662 Py_DECREF(rep);
7663 goto error;
7664 }
7665 *out = (unsigned char)ch;
7666 out++;
7667 }
7668 }
7669 Py_DECREF(rep);
7670 }
7671 /* write a NUL byte */
7672 *out = 0;
7673 outsize = out - PyBytes_AS_STRING(*outbytes);
7674 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7675 if (_PyBytes_Resize(outbytes, outsize) < 0)
7676 goto error;
7677 ret = 0;
7678
7679error:
7680 Py_XDECREF(encoding_obj);
7681 Py_XDECREF(errorHandler);
7682 Py_XDECREF(exc);
7683 return ret;
7684}
7685
Victor Stinner3a50e702011-10-18 21:21:00 +02007686static PyObject *
7687encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007688 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007689 const char *errors)
7690{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007691 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007692 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007693 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007694 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007695
Victor Stinner29dacf22015-01-26 16:41:32 +01007696 if (!PyUnicode_Check(unicode)) {
7697 PyErr_BadArgument();
7698 return NULL;
7699 }
7700
Benjamin Petersonbac79492012-01-14 13:34:47 -05007701 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007702 return NULL;
7703 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007704
Victor Stinner3a50e702011-10-18 21:21:00 +02007705 if (code_page < 0) {
7706 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7707 return NULL;
7708 }
7709
Martin v. Löwis3d325192011-11-04 18:23:06 +01007710 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007711 return PyBytes_FromStringAndSize(NULL, 0);
7712
Victor Stinner7581cef2011-11-03 22:32:33 +01007713 offset = 0;
7714 do
7715 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007716#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007717 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007718 chunks. */
7719 if (len > INT_MAX/2) {
7720 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007721 done = 0;
7722 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007723 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007724#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007725 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007726 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007727 done = 1;
7728 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007729
Victor Stinner76a31a62011-11-04 00:05:13 +01007730 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007731 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007732 errors);
7733 if (ret == -2)
7734 ret = encode_code_page_errors(code_page, &outbytes,
7735 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007736 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007737 if (ret < 0) {
7738 Py_XDECREF(outbytes);
7739 return NULL;
7740 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007741
Victor Stinner7581cef2011-11-03 22:32:33 +01007742 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007743 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007744 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007745
Victor Stinner3a50e702011-10-18 21:21:00 +02007746 return outbytes;
7747}
7748
7749PyObject *
7750PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7751 Py_ssize_t size,
7752 const char *errors)
7753{
Victor Stinner7581cef2011-11-03 22:32:33 +01007754 PyObject *unicode, *res;
7755 unicode = PyUnicode_FromUnicode(p, size);
7756 if (unicode == NULL)
7757 return NULL;
7758 res = encode_code_page(CP_ACP, unicode, errors);
7759 Py_DECREF(unicode);
7760 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007761}
7762
7763PyObject *
7764PyUnicode_EncodeCodePage(int code_page,
7765 PyObject *unicode,
7766 const char *errors)
7767{
Victor Stinner7581cef2011-11-03 22:32:33 +01007768 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007769}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007770
Alexander Belopolsky40018472011-02-26 01:02:56 +00007771PyObject *
7772PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007773{
Victor Stinner7581cef2011-11-03 22:32:33 +01007774 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007775}
7776
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007777#undef NEED_RETRY
7778
Victor Stinner99b95382011-07-04 14:23:54 +02007779#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007780
/* --- Character Mapping Codec -------------------------------------------- */
7782
Victor Stinnerfb161b12013-04-18 01:44:27 +02007783static int
7784charmap_decode_string(const char *s,
7785 Py_ssize_t size,
7786 PyObject *mapping,
7787 const char *errors,
7788 _PyUnicodeWriter *writer)
7789{
7790 const char *starts = s;
7791 const char *e;
7792 Py_ssize_t startinpos, endinpos;
7793 PyObject *errorHandler = NULL, *exc = NULL;
7794 Py_ssize_t maplen;
7795 enum PyUnicode_Kind mapkind;
7796 void *mapdata;
7797 Py_UCS4 x;
7798 unsigned char ch;
7799
7800 if (PyUnicode_READY(mapping) == -1)
7801 return -1;
7802
7803 maplen = PyUnicode_GET_LENGTH(mapping);
7804 mapdata = PyUnicode_DATA(mapping);
7805 mapkind = PyUnicode_KIND(mapping);
7806
7807 e = s + size;
7808
7809 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7810 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7811 * is disabled in encoding aliases, latin1 is preferred because
7812 * its implementation is faster. */
7813 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7814 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7815 Py_UCS4 maxchar = writer->maxchar;
7816
7817 assert (writer->kind == PyUnicode_1BYTE_KIND);
7818 while (s < e) {
7819 ch = *s;
7820 x = mapdata_ucs1[ch];
7821 if (x > maxchar) {
7822 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7823 goto onError;
7824 maxchar = writer->maxchar;
7825 outdata = (Py_UCS1 *)writer->data;
7826 }
7827 outdata[writer->pos] = x;
7828 writer->pos++;
7829 ++s;
7830 }
7831 return 0;
7832 }
7833
7834 while (s < e) {
7835 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7836 enum PyUnicode_Kind outkind = writer->kind;
7837 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7838 if (outkind == PyUnicode_1BYTE_KIND) {
7839 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7840 Py_UCS4 maxchar = writer->maxchar;
7841 while (s < e) {
7842 ch = *s;
7843 x = mapdata_ucs2[ch];
7844 if (x > maxchar)
7845 goto Error;
7846 outdata[writer->pos] = x;
7847 writer->pos++;
7848 ++s;
7849 }
7850 break;
7851 }
7852 else if (outkind == PyUnicode_2BYTE_KIND) {
7853 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7854 while (s < e) {
7855 ch = *s;
7856 x = mapdata_ucs2[ch];
7857 if (x == 0xFFFE)
7858 goto Error;
7859 outdata[writer->pos] = x;
7860 writer->pos++;
7861 ++s;
7862 }
7863 break;
7864 }
7865 }
7866 ch = *s;
7867
7868 if (ch < maplen)
7869 x = PyUnicode_READ(mapkind, mapdata, ch);
7870 else
7871 x = 0xfffe; /* invalid value */
7872Error:
7873 if (x == 0xfffe)
7874 {
7875 /* undefined mapping */
7876 startinpos = s-starts;
7877 endinpos = startinpos+1;
7878 if (unicode_decode_call_errorhandler_writer(
7879 errors, &errorHandler,
7880 "charmap", "character maps to <undefined>",
7881 &starts, &e, &startinpos, &endinpos, &exc, &s,
7882 writer)) {
7883 goto onError;
7884 }
7885 continue;
7886 }
7887
7888 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
7889 goto onError;
7890 ++s;
7891 }
7892 Py_XDECREF(errorHandler);
7893 Py_XDECREF(exc);
7894 return 0;
7895
7896onError:
7897 Py_XDECREF(errorHandler);
7898 Py_XDECREF(exc);
7899 return -1;
7900}
7901
7902static int
7903charmap_decode_mapping(const char *s,
7904 Py_ssize_t size,
7905 PyObject *mapping,
7906 const char *errors,
7907 _PyUnicodeWriter *writer)
7908{
7909 const char *starts = s;
7910 const char *e;
7911 Py_ssize_t startinpos, endinpos;
7912 PyObject *errorHandler = NULL, *exc = NULL;
7913 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02007914 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02007915
7916 e = s + size;
7917
7918 while (s < e) {
7919 ch = *s;
7920
7921 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7922 key = PyLong_FromLong((long)ch);
7923 if (key == NULL)
7924 goto onError;
7925
7926 item = PyObject_GetItem(mapping, key);
7927 Py_DECREF(key);
7928 if (item == NULL) {
7929 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7930 /* No mapping found means: mapping is undefined. */
7931 PyErr_Clear();
7932 goto Undefined;
7933 } else
7934 goto onError;
7935 }
7936
7937 /* Apply mapping */
7938 if (item == Py_None)
7939 goto Undefined;
7940 if (PyLong_Check(item)) {
7941 long value = PyLong_AS_LONG(item);
7942 if (value == 0xFFFE)
7943 goto Undefined;
7944 if (value < 0 || value > MAX_UNICODE) {
7945 PyErr_Format(PyExc_TypeError,
7946 "character mapping must be in range(0x%lx)",
7947 (unsigned long)MAX_UNICODE + 1);
7948 goto onError;
7949 }
7950
7951 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7952 goto onError;
7953 }
7954 else if (PyUnicode_Check(item)) {
7955 if (PyUnicode_READY(item) == -1)
7956 goto onError;
7957 if (PyUnicode_GET_LENGTH(item) == 1) {
7958 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
7959 if (value == 0xFFFE)
7960 goto Undefined;
7961 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7962 goto onError;
7963 }
7964 else {
7965 writer->overallocate = 1;
7966 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
7967 goto onError;
7968 }
7969 }
7970 else {
7971 /* wrong return value */
7972 PyErr_SetString(PyExc_TypeError,
7973 "character mapping must return integer, None or str");
7974 goto onError;
7975 }
7976 Py_CLEAR(item);
7977 ++s;
7978 continue;
7979
7980Undefined:
7981 /* undefined mapping */
7982 Py_CLEAR(item);
7983 startinpos = s-starts;
7984 endinpos = startinpos+1;
7985 if (unicode_decode_call_errorhandler_writer(
7986 errors, &errorHandler,
7987 "charmap", "character maps to <undefined>",
7988 &starts, &e, &startinpos, &endinpos, &exc, &s,
7989 writer)) {
7990 goto onError;
7991 }
7992 }
7993 Py_XDECREF(errorHandler);
7994 Py_XDECREF(exc);
7995 return 0;
7996
7997onError:
7998 Py_XDECREF(item);
7999 Py_XDECREF(errorHandler);
8000 Py_XDECREF(exc);
8001 return -1;
8002}
8003
Alexander Belopolsky40018472011-02-26 01:02:56 +00008004PyObject *
8005PyUnicode_DecodeCharmap(const char *s,
8006 Py_ssize_t size,
8007 PyObject *mapping,
8008 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008009{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008010 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00008011
Guido van Rossumd57fd912000-03-10 22:53:23 +00008012 /* Default to Latin-1 */
8013 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008014 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008015
Guido van Rossumd57fd912000-03-10 22:53:23 +00008016 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02008017 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02008018 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02008019 writer.min_length = size;
8020 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008021 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008022
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008023 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008024 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8025 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008026 }
8027 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008028 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8029 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008030 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008031 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00008032
Benjamin Peterson29060642009-01-31 22:14:21 +00008033 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008034 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008035 return NULL;
8036}
8037
/* Charmap encoding: the lookup table */
8039
Alexander Belopolsky40018472011-02-26 01:02:56 +00008040struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00008041 PyObject_HEAD
8042 unsigned char level1[32];
8043 int count2, count3;
8044 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008045};
8046
8047static PyObject*
8048encoding_map_size(PyObject *obj, PyObject* args)
8049{
8050 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008051 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00008052 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008053}
8054
8055static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008056 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00008057 PyDoc_STR("Return the size (in bytes) of this object") },
8058 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008059};
8060
8061static void
8062encoding_map_dealloc(PyObject* o)
8063{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008064 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008065}
8066
8067static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008068 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008069 "EncodingMap", /*tp_name*/
8070 sizeof(struct encoding_map), /*tp_basicsize*/
8071 0, /*tp_itemsize*/
8072 /* methods */
8073 encoding_map_dealloc, /*tp_dealloc*/
8074 0, /*tp_print*/
8075 0, /*tp_getattr*/
8076 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00008077 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008078 0, /*tp_repr*/
8079 0, /*tp_as_number*/
8080 0, /*tp_as_sequence*/
8081 0, /*tp_as_mapping*/
8082 0, /*tp_hash*/
8083 0, /*tp_call*/
8084 0, /*tp_str*/
8085 0, /*tp_getattro*/
8086 0, /*tp_setattro*/
8087 0, /*tp_as_buffer*/
8088 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8089 0, /*tp_doc*/
8090 0, /*tp_traverse*/
8091 0, /*tp_clear*/
8092 0, /*tp_richcompare*/
8093 0, /*tp_weaklistoffset*/
8094 0, /*tp_iter*/
8095 0, /*tp_iternext*/
8096 encoding_map_methods, /*tp_methods*/
8097 0, /*tp_members*/
8098 0, /*tp_getset*/
8099 0, /*tp_base*/
8100 0, /*tp_dict*/
8101 0, /*tp_descr_get*/
8102 0, /*tp_descr_set*/
8103 0, /*tp_dictoffset*/
8104 0, /*tp_init*/
8105 0, /*tp_alloc*/
8106 0, /*tp_new*/
8107 0, /*tp_free*/
8108 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008109};
8110
8111PyObject*
8112PyUnicode_BuildEncodingMap(PyObject* string)
8113{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008114 PyObject *result;
8115 struct encoding_map *mresult;
8116 int i;
8117 int need_dict = 0;
8118 unsigned char level1[32];
8119 unsigned char level2[512];
8120 unsigned char *mlevel1, *mlevel2, *mlevel3;
8121 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008122 int kind;
8123 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008124 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008125 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008126
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008127 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008128 PyErr_BadArgument();
8129 return NULL;
8130 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008131 kind = PyUnicode_KIND(string);
8132 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008133 length = PyUnicode_GET_LENGTH(string);
8134 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008135 memset(level1, 0xFF, sizeof level1);
8136 memset(level2, 0xFF, sizeof level2);
8137
8138 /* If there isn't a one-to-one mapping of NULL to \0,
8139 or if there are non-BMP characters, we need to use
8140 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008141 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008142 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008143 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008144 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008145 ch = PyUnicode_READ(kind, data, i);
8146 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008147 need_dict = 1;
8148 break;
8149 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008150 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008151 /* unmapped character */
8152 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008153 l1 = ch >> 11;
8154 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008155 if (level1[l1] == 0xFF)
8156 level1[l1] = count2++;
8157 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008158 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008159 }
8160
8161 if (count2 >= 0xFF || count3 >= 0xFF)
8162 need_dict = 1;
8163
8164 if (need_dict) {
8165 PyObject *result = PyDict_New();
8166 PyObject *key, *value;
8167 if (!result)
8168 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008169 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008170 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008171 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008172 if (!key || !value)
8173 goto failed1;
8174 if (PyDict_SetItem(result, key, value) == -1)
8175 goto failed1;
8176 Py_DECREF(key);
8177 Py_DECREF(value);
8178 }
8179 return result;
8180 failed1:
8181 Py_XDECREF(key);
8182 Py_XDECREF(value);
8183 Py_DECREF(result);
8184 return NULL;
8185 }
8186
8187 /* Create a three-level trie */
8188 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8189 16*count2 + 128*count3 - 1);
8190 if (!result)
8191 return PyErr_NoMemory();
8192 PyObject_Init(result, &EncodingMapType);
8193 mresult = (struct encoding_map*)result;
8194 mresult->count2 = count2;
8195 mresult->count3 = count3;
8196 mlevel1 = mresult->level1;
8197 mlevel2 = mresult->level23;
8198 mlevel3 = mresult->level23 + 16*count2;
8199 memcpy(mlevel1, level1, 32);
8200 memset(mlevel2, 0xFF, 16*count2);
8201 memset(mlevel3, 0, 128*count3);
8202 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008203 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008204 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008205 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8206 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008207 /* unmapped character */
8208 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008209 o1 = ch>>11;
8210 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008211 i2 = 16*mlevel1[o1] + o2;
8212 if (mlevel2[i2] == 0xFF)
8213 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008214 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008215 i3 = 128*mlevel2[i2] + o3;
8216 mlevel3[i3] = i;
8217 }
8218 return result;
8219}
8220
8221static int
Victor Stinner22168992011-11-20 17:09:18 +01008222encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008223{
8224 struct encoding_map *map = (struct encoding_map*)mapping;
8225 int l1 = c>>11;
8226 int l2 = (c>>7) & 0xF;
8227 int l3 = c & 0x7F;
8228 int i;
8229
Victor Stinner22168992011-11-20 17:09:18 +01008230 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008231 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008232 if (c == 0)
8233 return 0;
8234 /* level 1*/
8235 i = map->level1[l1];
8236 if (i == 0xFF) {
8237 return -1;
8238 }
8239 /* level 2*/
8240 i = map->level23[16*i+l2];
8241 if (i == 0xFF) {
8242 return -1;
8243 }
8244 /* level 3 */
8245 i = map->level23[16*map->count2 + 128*i + l3];
8246 if (i == 0) {
8247 return -1;
8248 }
8249 return i;
8250}
8251
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008252/* Lookup the character ch in the mapping. If the character
8253 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008254 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008255static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008256charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008257{
Christian Heimes217cfd12007-12-02 14:31:20 +00008258 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008259 PyObject *x;
8260
8261 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008262 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008263 x = PyObject_GetItem(mapping, w);
8264 Py_DECREF(w);
8265 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008266 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8267 /* No mapping found means: mapping is undefined. */
8268 PyErr_Clear();
8269 x = Py_None;
8270 Py_INCREF(x);
8271 return x;
8272 } else
8273 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008274 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008275 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008276 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008277 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008278 long value = PyLong_AS_LONG(x);
8279 if (value < 0 || value > 255) {
8280 PyErr_SetString(PyExc_TypeError,
8281 "character mapping must be in range(256)");
8282 Py_DECREF(x);
8283 return NULL;
8284 }
8285 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008286 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008287 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008288 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008289 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008290 /* wrong return value */
8291 PyErr_Format(PyExc_TypeError,
8292 "character mapping must return integer, bytes or None, not %.400s",
8293 x->ob_type->tp_name);
8294 Py_DECREF(x);
8295 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008296 }
8297}
8298
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008299static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008300charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008301{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008302 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8303 /* exponentially overallocate to minimize reallocations */
8304 if (requiredsize < 2*outsize)
8305 requiredsize = 2*outsize;
8306 if (_PyBytes_Resize(outobj, requiredsize))
8307 return -1;
8308 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008309}
8310
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* an exception was raised */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008314/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008315 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008316 space is available. Return a new reference to the object that
8317 was put in the output buffer, or Py_None, if the mapping was undefined
8318 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008319 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008320static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008321charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008322 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008323{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008324 PyObject *rep;
8325 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008326 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008327
Christian Heimes90aa7642007-12-19 02:45:37 +00008328 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008329 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008330 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008331 if (res == -1)
8332 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008333 if (outsize<requiredsize)
8334 if (charmapencode_resize(outobj, outpos, requiredsize))
8335 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008336 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008337 outstart[(*outpos)++] = (char)res;
8338 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008339 }
8340
8341 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008342 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008343 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008344 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008345 Py_DECREF(rep);
8346 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008347 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008348 if (PyLong_Check(rep)) {
8349 Py_ssize_t requiredsize = *outpos+1;
8350 if (outsize<requiredsize)
8351 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8352 Py_DECREF(rep);
8353 return enc_EXCEPTION;
8354 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008355 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008356 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008357 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008358 else {
8359 const char *repchars = PyBytes_AS_STRING(rep);
8360 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8361 Py_ssize_t requiredsize = *outpos+repsize;
8362 if (outsize<requiredsize)
8363 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8364 Py_DECREF(rep);
8365 return enc_EXCEPTION;
8366 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008367 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008368 memcpy(outstart + *outpos, repchars, repsize);
8369 *outpos += repsize;
8370 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008371 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008372 Py_DECREF(rep);
8373 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008374}
8375
8376/* handle an error in PyUnicode_EncodeCharmap
8377 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008378static int
8379charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008380 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008381 PyObject **exceptionObject,
Victor Stinner50149202015-09-22 00:26:54 +02008382 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008383 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008384{
8385 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008386 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008387 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008388 enum PyUnicode_Kind kind;
8389 void *data;
8390 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008391 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008392 Py_ssize_t collstartpos = *inpos;
8393 Py_ssize_t collendpos = *inpos+1;
8394 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008395 char *encoding = "charmap";
8396 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008397 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008398 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008399 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008400
Benjamin Petersonbac79492012-01-14 13:34:47 -05008401 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008402 return -1;
8403 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008404 /* find all unencodable characters */
8405 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008406 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008407 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008408 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008409 val = encoding_map_lookup(ch, mapping);
8410 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008411 break;
8412 ++collendpos;
8413 continue;
8414 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008415
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008416 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8417 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008418 if (rep==NULL)
8419 return -1;
8420 else if (rep!=Py_None) {
8421 Py_DECREF(rep);
8422 break;
8423 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008424 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008425 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008426 }
8427 /* cache callback name lookup
8428 * (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02008429 if (*error_handler == _Py_ERROR_UNKNOWN)
8430 *error_handler = get_error_handler(errors);
8431
8432 switch (*error_handler) {
8433 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008434 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008435 return -1;
Victor Stinner50149202015-09-22 00:26:54 +02008436
8437 case _Py_ERROR_REPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008438 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008439 x = charmapencode_output('?', mapping, res, respos);
8440 if (x==enc_EXCEPTION) {
8441 return -1;
8442 }
8443 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008444 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008445 return -1;
8446 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008447 }
8448 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02008449 case _Py_ERROR_IGNORE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008450 *inpos = collendpos;
8451 break;
Victor Stinner50149202015-09-22 00:26:54 +02008452
8453 case _Py_ERROR_XMLCHARREFREPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008454 /* generate replacement (temporarily (mis)uses p) */
8455 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008456 char buffer[2+29+1+1];
8457 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008458 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008459 for (cp = buffer; *cp; ++cp) {
8460 x = charmapencode_output(*cp, mapping, res, respos);
8461 if (x==enc_EXCEPTION)
8462 return -1;
8463 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008464 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008465 return -1;
8466 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008467 }
8468 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008469 *inpos = collendpos;
8470 break;
Victor Stinner50149202015-09-22 00:26:54 +02008471
Benjamin Peterson14339b62009-01-31 16:36:08 +00008472 default:
Victor Stinner50149202015-09-22 00:26:54 +02008473 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008474 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008475 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008476 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008477 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008478 if (PyBytes_Check(repunicode)) {
8479 /* Directly copy bytes result to output. */
8480 Py_ssize_t outsize = PyBytes_Size(*res);
8481 Py_ssize_t requiredsize;
8482 repsize = PyBytes_Size(repunicode);
8483 requiredsize = *respos + repsize;
8484 if (requiredsize > outsize)
8485 /* Make room for all additional bytes. */
8486 if (charmapencode_resize(res, respos, requiredsize)) {
8487 Py_DECREF(repunicode);
8488 return -1;
8489 }
8490 memcpy(PyBytes_AsString(*res) + *respos,
8491 PyBytes_AsString(repunicode), repsize);
8492 *respos += repsize;
8493 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008494 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008495 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008496 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008497 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008498 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008499 Py_DECREF(repunicode);
8500 return -1;
8501 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008502 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008503 data = PyUnicode_DATA(repunicode);
8504 kind = PyUnicode_KIND(repunicode);
8505 for (index = 0; index < repsize; index++) {
8506 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8507 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008508 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008509 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008510 return -1;
8511 }
8512 else if (x==enc_FAILED) {
8513 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008514 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008515 return -1;
8516 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008517 }
8518 *inpos = newpos;
8519 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008520 }
8521 return 0;
8522}
8523
Alexander Belopolsky40018472011-02-26 01:02:56 +00008524PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008525_PyUnicode_EncodeCharmap(PyObject *unicode,
8526 PyObject *mapping,
8527 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008528{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008529 /* output object */
8530 PyObject *res = NULL;
8531 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008532 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008533 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008534 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008535 Py_ssize_t respos = 0;
Victor Stinner50149202015-09-22 00:26:54 +02008536 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008537 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02008538 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008539 void *data;
8540 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008541
Benjamin Petersonbac79492012-01-14 13:34:47 -05008542 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008543 return NULL;
8544 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008545 data = PyUnicode_DATA(unicode);
8546 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008547
Guido van Rossumd57fd912000-03-10 22:53:23 +00008548 /* Default to Latin-1 */
8549 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008550 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008551
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008552 /* allocate enough for a simple encoding without
8553 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008554 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008555 if (res == NULL)
8556 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008557 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008558 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008559
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008560 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008561 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008562 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008563 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008564 if (x==enc_EXCEPTION) /* error */
8565 goto onError;
8566 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008567 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008568 &exc,
Victor Stinner50149202015-09-22 00:26:54 +02008569 &error_handler, &error_handler_obj, errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00008570 &res, &respos)) {
8571 goto onError;
8572 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008573 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008574 else
8575 /* done with this character => adjust input position */
8576 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008577 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008578
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008579 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008580 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008581 if (_PyBytes_Resize(&res, respos) < 0)
8582 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008583
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008584 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008585 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008586 return res;
8587
Benjamin Peterson29060642009-01-31 22:14:21 +00008588 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008589 Py_XDECREF(res);
8590 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008591 Py_XDECREF(error_handler_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008592 return NULL;
8593}
8594
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008595/* Deprecated */
8596PyObject *
8597PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8598 Py_ssize_t size,
8599 PyObject *mapping,
8600 const char *errors)
8601{
8602 PyObject *result;
8603 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8604 if (unicode == NULL)
8605 return NULL;
8606 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8607 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008608 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008609}
8610
Alexander Belopolsky40018472011-02-26 01:02:56 +00008611PyObject *
8612PyUnicode_AsCharmapString(PyObject *unicode,
8613 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008614{
8615 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008616 PyErr_BadArgument();
8617 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008618 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008619 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008620}
8621
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008622/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008623static void
8624make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008625 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008626 Py_ssize_t startpos, Py_ssize_t endpos,
8627 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008628{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008629 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008630 *exceptionObject = _PyUnicodeTranslateError_Create(
8631 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008632 }
8633 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008634 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8635 goto onError;
8636 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8637 goto onError;
8638 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8639 goto onError;
8640 return;
8641 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008642 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008643 }
8644}
8645
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008646/* error handling callback helper:
8647 build arguments, call the callback and check the arguments,
8648 put the result into newpos and return the replacement string, which
8649 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008650static PyObject *
8651unicode_translate_call_errorhandler(const char *errors,
8652 PyObject **errorHandler,
8653 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008654 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008655 Py_ssize_t startpos, Py_ssize_t endpos,
8656 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008657{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02008658 static const char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008659
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008660 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008661 PyObject *restuple;
8662 PyObject *resunicode;
8663
8664 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008665 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008666 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008667 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008668 }
8669
8670 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008671 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008672 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008673 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008674
8675 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008676 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008677 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008678 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008679 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008680 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008681 Py_DECREF(restuple);
8682 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008683 }
8684 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008685 &resunicode, &i_newpos)) {
8686 Py_DECREF(restuple);
8687 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008688 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008689 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008690 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008691 else
8692 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008693 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02008694 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008695 Py_DECREF(restuple);
8696 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008697 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008698 Py_INCREF(resunicode);
8699 Py_DECREF(restuple);
8700 return resunicode;
8701}
8702
8703/* Lookup the character ch in the mapping and put the result in result,
8704 which must be decrefed by the caller.
8705 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008706static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008707charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008708{
Christian Heimes217cfd12007-12-02 14:31:20 +00008709 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008710 PyObject *x;
8711
8712 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008713 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008714 x = PyObject_GetItem(mapping, w);
8715 Py_DECREF(w);
8716 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008717 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8718 /* No mapping found means: use 1:1 mapping. */
8719 PyErr_Clear();
8720 *result = NULL;
8721 return 0;
8722 } else
8723 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008724 }
8725 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008726 *result = x;
8727 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008728 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008729 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008730 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008731 if (value < 0 || value > MAX_UNICODE) {
8732 PyErr_Format(PyExc_ValueError,
8733 "character mapping must be in range(0x%x)",
8734 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008735 Py_DECREF(x);
8736 return -1;
8737 }
8738 *result = x;
8739 return 0;
8740 }
8741 else if (PyUnicode_Check(x)) {
8742 *result = x;
8743 return 0;
8744 }
8745 else {
8746 /* wrong return value */
8747 PyErr_SetString(PyExc_TypeError,
8748 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008749 Py_DECREF(x);
8750 return -1;
8751 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008752}
Victor Stinner1194ea02014-04-04 19:37:40 +02008753
8754/* lookup the character, write the result into the writer.
8755 Return 1 if the result was written into the writer, return 0 if the mapping
8756 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008757static int
Victor Stinner1194ea02014-04-04 19:37:40 +02008758charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8759 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008760{
Victor Stinner1194ea02014-04-04 19:37:40 +02008761 PyObject *item;
8762
8763 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00008764 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008765
8766 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008767 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02008768 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008769 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008770 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008771 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008772 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008773
8774 if (item == Py_None) {
8775 Py_DECREF(item);
8776 return 0;
8777 }
8778
8779 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02008780 long ch = (Py_UCS4)PyLong_AS_LONG(item);
8781 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8782 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008783 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8784 Py_DECREF(item);
8785 return -1;
8786 }
8787 Py_DECREF(item);
8788 return 1;
8789 }
8790
8791 if (!PyUnicode_Check(item)) {
8792 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008793 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008794 }
8795
8796 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8797 Py_DECREF(item);
8798 return -1;
8799 }
8800
8801 Py_DECREF(item);
8802 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008803}
8804
Victor Stinner89a76ab2014-04-05 11:44:04 +02008805static int
8806unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8807 Py_UCS1 *translate)
8808{
Benjamin Peterson1365de72014-04-07 20:15:41 -04008809 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008810 int ret = 0;
8811
Victor Stinner89a76ab2014-04-05 11:44:04 +02008812 if (charmaptranslate_lookup(ch, mapping, &item)) {
8813 return -1;
8814 }
8815
8816 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008817 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02008818 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008819 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008820 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008821 /* not found => default to 1:1 mapping */
8822 translate[ch] = ch;
8823 return 1;
8824 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008825 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02008826 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008827 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8828 used it */
8829 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008830 /* invalid character or character outside ASCII:
8831 skip the fast translate */
8832 goto exit;
8833 }
8834 translate[ch] = (Py_UCS1)replace;
8835 }
8836 else if (PyUnicode_Check(item)) {
8837 Py_UCS4 replace;
8838
8839 if (PyUnicode_READY(item) == -1) {
8840 Py_DECREF(item);
8841 return -1;
8842 }
8843 if (PyUnicode_GET_LENGTH(item) != 1)
8844 goto exit;
8845
8846 replace = PyUnicode_READ_CHAR(item, 0);
8847 if (replace > 127)
8848 goto exit;
8849 translate[ch] = (Py_UCS1)replace;
8850 }
8851 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008852 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02008853 goto exit;
8854 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008855 ret = 1;
8856
Benjamin Peterson1365de72014-04-07 20:15:41 -04008857 exit:
8858 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008859 return ret;
8860}
8861
8862/* Fast path for ascii => ascii translation. Return 1 if the whole string
8863 was translated into writer, return 0 if the input string was partially
8864 translated into writer, raise an exception and return -1 on error. */
8865static int
8866unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008867 _PyUnicodeWriter *writer, int ignore,
8868 Py_ssize_t *input_pos)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008869{
Victor Stinner872b2912014-04-05 14:27:07 +02008870 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008871 Py_ssize_t len;
8872 Py_UCS1 *in, *end, *out;
Victor Stinner872b2912014-04-05 14:27:07 +02008873 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008874
Victor Stinner89a76ab2014-04-05 11:44:04 +02008875 len = PyUnicode_GET_LENGTH(input);
8876
Victor Stinner872b2912014-04-05 14:27:07 +02008877 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008878
8879 in = PyUnicode_1BYTE_DATA(input);
8880 end = in + len;
8881
8882 assert(PyUnicode_IS_ASCII(writer->buffer));
8883 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
8884 out = PyUnicode_1BYTE_DATA(writer->buffer);
8885
Victor Stinner872b2912014-04-05 14:27:07 +02008886 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008887 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02008888 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008889 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02008890 int translate = unicode_fast_translate_lookup(mapping, ch,
8891 ascii_table);
8892 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008893 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02008894 if (translate == 0)
8895 goto exit;
8896 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008897 }
Victor Stinner872b2912014-04-05 14:27:07 +02008898 if (ch2 == 0xfe) {
8899 if (ignore)
8900 continue;
8901 goto exit;
8902 }
8903 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008904 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02008905 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008906 }
Victor Stinner872b2912014-04-05 14:27:07 +02008907 res = 1;
8908
8909exit:
8910 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008911 *input_pos = in - PyUnicode_1BYTE_DATA(input);
Victor Stinner872b2912014-04-05 14:27:07 +02008912 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008913}
8914
Victor Stinner3222da22015-10-01 22:07:32 +02008915static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008916_PyUnicode_TranslateCharmap(PyObject *input,
8917 PyObject *mapping,
8918 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008919{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008920 /* input object */
Victor Stinner1194ea02014-04-04 19:37:40 +02008921 char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008922 Py_ssize_t size, i;
8923 int kind;
8924 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02008925 _PyUnicodeWriter writer;
8926 /* error handler */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008927 char *reason = "character maps to <undefined>";
8928 PyObject *errorHandler = NULL;
8929 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008930 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008931 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008932
Guido van Rossumd57fd912000-03-10 22:53:23 +00008933 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008934 PyErr_BadArgument();
8935 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008936 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008937
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008938 if (PyUnicode_READY(input) == -1)
8939 return NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008940 data = (char*)PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008941 kind = PyUnicode_KIND(input);
8942 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008943
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03008944 if (size == 0)
8945 return PyUnicode_FromObject(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008946
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008947 /* allocate enough for a simple 1:1 translation without
8948 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02008949 _PyUnicodeWriter_Init(&writer);
8950 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008951 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008952
Victor Stinner872b2912014-04-05 14:27:07 +02008953 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
8954
Victor Stinner33798672016-03-01 21:59:58 +01008955 if (PyUnicode_READY(input) == -1)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008956 return NULL;
Victor Stinner33798672016-03-01 21:59:58 +01008957 if (PyUnicode_IS_ASCII(input)) {
8958 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
8959 if (res < 0) {
8960 _PyUnicodeWriter_Dealloc(&writer);
8961 return NULL;
8962 }
8963 if (res == 1)
8964 return _PyUnicodeWriter_Finish(&writer);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008965 }
Victor Stinner33798672016-03-01 21:59:58 +01008966 else {
8967 i = 0;
8968 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008969
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008970 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008971 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008972 int translate;
8973 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8974 Py_ssize_t newpos;
8975 /* startpos for collecting untranslatable chars */
8976 Py_ssize_t collstart;
8977 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02008978 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008979
Victor Stinner1194ea02014-04-04 19:37:40 +02008980 ch = PyUnicode_READ(kind, data, i);
8981 translate = charmaptranslate_output(ch, mapping, &writer);
8982 if (translate < 0)
8983 goto onError;
8984
8985 if (translate != 0) {
8986 /* it worked => adjust input pointer */
8987 ++i;
8988 continue;
8989 }
8990
8991 /* untranslatable character */
8992 collstart = i;
8993 collend = i+1;
8994
8995 /* find all untranslatable characters */
8996 while (collend < size) {
8997 PyObject *x;
8998 ch = PyUnicode_READ(kind, data, collend);
8999 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009000 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02009001 Py_XDECREF(x);
9002 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00009003 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02009004 ++collend;
9005 }
9006
9007 if (ignore) {
9008 i = collend;
9009 }
9010 else {
9011 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9012 reason, input, &exc,
9013 collstart, collend, &newpos);
9014 if (repunicode == NULL)
9015 goto onError;
9016 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009017 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02009018 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009019 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009020 Py_DECREF(repunicode);
9021 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009022 }
9023 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009024 Py_XDECREF(exc);
9025 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02009026 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009027
Benjamin Peterson29060642009-01-31 22:14:21 +00009028 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02009029 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009030 Py_XDECREF(exc);
9031 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009032 return NULL;
9033}
9034
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009035/* Deprecated. Use PyUnicode_Translate instead. */
9036PyObject *
9037PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9038 Py_ssize_t size,
9039 PyObject *mapping,
9040 const char *errors)
9041{
Christian Heimes5f520f42012-09-11 14:03:25 +02009042 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009043 PyObject *unicode = PyUnicode_FromUnicode(p, size);
9044 if (!unicode)
9045 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02009046 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9047 Py_DECREF(unicode);
9048 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009049}
9050
Alexander Belopolsky40018472011-02-26 01:02:56 +00009051PyObject *
9052PyUnicode_Translate(PyObject *str,
9053 PyObject *mapping,
9054 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009055{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009056 if (ensure_unicode(str) < 0)
Christian Heimes5f520f42012-09-11 14:03:25 +02009057 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009058 return _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009059}
Tim Petersced69f82003-09-16 20:30:58 +00009060
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009061static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009062fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009063{
9064 /* No need to call PyUnicode_READY(self) because this function is only
9065 called as a callback from fixup() which does it already. */
9066 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9067 const int kind = PyUnicode_KIND(self);
9068 void *data = PyUnicode_DATA(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02009069 Py_UCS4 maxchar = 127, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009070 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009071 Py_ssize_t i;
9072
9073 for (i = 0; i < len; ++i) {
9074 ch = PyUnicode_READ(kind, data, i);
9075 fixed = 0;
9076 if (ch > 127) {
9077 if (Py_UNICODE_ISSPACE(ch))
9078 fixed = ' ';
9079 else {
9080 const int decimal = Py_UNICODE_TODECIMAL(ch);
9081 if (decimal >= 0)
9082 fixed = '0' + decimal;
9083 }
9084 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009085 modified = 1;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009086 maxchar = Py_MAX(maxchar, fixed);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009087 PyUnicode_WRITE(kind, data, i, fixed);
9088 }
Victor Stinnere6abb482012-05-02 01:15:40 +02009089 else
Benjamin Peterson7e303732013-06-10 09:19:46 -07009090 maxchar = Py_MAX(maxchar, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009091 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009092 }
9093
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009094 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009095}
9096
9097PyObject *
9098_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9099{
9100 if (!PyUnicode_Check(unicode)) {
9101 PyErr_BadInternalCall();
9102 return NULL;
9103 }
9104 if (PyUnicode_READY(unicode) == -1)
9105 return NULL;
9106 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
9107 /* If the string is already ASCII, just return the same string */
9108 Py_INCREF(unicode);
9109 return unicode;
9110 }
Victor Stinner9310abb2011-10-05 00:59:23 +02009111 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009112}
9113
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009114PyObject *
9115PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9116 Py_ssize_t length)
9117{
Victor Stinnerf0124502011-11-21 23:12:56 +01009118 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009119 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009120 Py_UCS4 maxchar;
9121 enum PyUnicode_Kind kind;
9122 void *data;
9123
Victor Stinner99d7ad02012-02-22 13:37:39 +01009124 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009125 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009126 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009127 if (ch > 127) {
9128 int decimal = Py_UNICODE_TODECIMAL(ch);
9129 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009130 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009131 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009132 }
9133 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009134
9135 /* Copy to a new string */
9136 decimal = PyUnicode_New(length, maxchar);
9137 if (decimal == NULL)
9138 return decimal;
9139 kind = PyUnicode_KIND(decimal);
9140 data = PyUnicode_DATA(decimal);
9141 /* Iterate over code points */
9142 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009143 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01009144 if (ch > 127) {
9145 int decimal = Py_UNICODE_TODECIMAL(ch);
9146 if (decimal >= 0)
9147 ch = '0' + decimal;
9148 }
9149 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009150 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009151 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009152}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009153/* --- Decimal Encoder ---------------------------------------------------- */
9154
Alexander Belopolsky40018472011-02-26 01:02:56 +00009155int
9156PyUnicode_EncodeDecimal(Py_UNICODE *s,
9157 Py_ssize_t length,
9158 char *output,
9159 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009160{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009161 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009162 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009163 enum PyUnicode_Kind kind;
9164 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009165
9166 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009167 PyErr_BadArgument();
9168 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009169 }
9170
Victor Stinner42bf7752011-11-21 22:52:58 +01009171 unicode = PyUnicode_FromUnicode(s, length);
9172 if (unicode == NULL)
9173 return -1;
9174
Benjamin Petersonbac79492012-01-14 13:34:47 -05009175 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner6345be92011-11-25 20:09:01 +01009176 Py_DECREF(unicode);
9177 return -1;
9178 }
Victor Stinner42bf7752011-11-21 22:52:58 +01009179 kind = PyUnicode_KIND(unicode);
9180 data = PyUnicode_DATA(unicode);
9181
Victor Stinnerb84d7232011-11-22 01:50:07 +01009182 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009183 PyObject *exc;
9184 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009185 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009186 Py_ssize_t startpos;
9187
9188 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009189
Benjamin Peterson29060642009-01-31 22:14:21 +00009190 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009191 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009192 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009193 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009194 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009195 decimal = Py_UNICODE_TODECIMAL(ch);
9196 if (decimal >= 0) {
9197 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009198 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009199 continue;
9200 }
9201 if (0 < ch && ch < 256) {
9202 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009203 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009204 continue;
9205 }
Victor Stinner6345be92011-11-25 20:09:01 +01009206
Victor Stinner42bf7752011-11-21 22:52:58 +01009207 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009208 exc = NULL;
9209 raise_encode_exception(&exc, "decimal", unicode,
9210 startpos, startpos+1,
9211 "invalid decimal Unicode string");
9212 Py_XDECREF(exc);
9213 Py_DECREF(unicode);
9214 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009215 }
9216 /* 0-terminate the output string */
9217 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009218 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009219 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009220}
9221
Guido van Rossumd57fd912000-03-10 22:53:23 +00009222/* --- Helpers ------------------------------------------------------------ */
9223
/* helper macro to fixup start/end slice values

   `end` is clamped to `len`; a negative `end` or `start` is first offset
   by `len` and then clamped to 0.  `start` is not clamped against `len`.
   `start` and `end` must be modifiable lvalues, and every argument may be
   evaluated more than once.

   The body is wrapped in do { } while (0) so the macro expands to exactly
   one statement and can be used safely as the body of an if/else; it must
   be followed by a semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
9238
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009239static Py_ssize_t
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009240any_find_slice(PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009241 Py_ssize_t start,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009242 Py_ssize_t end,
9243 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009244{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009245 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009246 void *buf1, *buf2;
9247 Py_ssize_t len1, len2, result;
9248
9249 kind1 = PyUnicode_KIND(s1);
9250 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009251 if (kind1 < kind2)
9252 return -1;
9253
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009254 len1 = PyUnicode_GET_LENGTH(s1);
9255 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009256 ADJUST_INDICES(start, end, len1);
9257 if (end - start < len2)
9258 return -1;
9259
9260 buf1 = PyUnicode_DATA(s1);
9261 buf2 = PyUnicode_DATA(s2);
9262 if (len2 == 1) {
9263 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9264 result = findchar((const char *)buf1 + kind1*start,
9265 kind1, end - start, ch, direction);
9266 if (result == -1)
9267 return -1;
9268 else
9269 return start + result;
9270 }
9271
9272 if (kind2 != kind1) {
9273 buf2 = _PyUnicode_AsKind(s2, kind1);
9274 if (!buf2)
9275 return -2;
9276 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009277
Victor Stinner794d5672011-10-10 03:21:36 +02009278 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009279 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009280 case PyUnicode_1BYTE_KIND:
9281 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9282 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9283 else
9284 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9285 break;
9286 case PyUnicode_2BYTE_KIND:
9287 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9288 break;
9289 case PyUnicode_4BYTE_KIND:
9290 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9291 break;
9292 default:
9293 assert(0); result = -2;
9294 }
9295 }
9296 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009297 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009298 case PyUnicode_1BYTE_KIND:
9299 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9300 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9301 else
9302 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9303 break;
9304 case PyUnicode_2BYTE_KIND:
9305 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9306 break;
9307 case PyUnicode_4BYTE_KIND:
9308 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9309 break;
9310 default:
9311 assert(0); result = -2;
9312 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009313 }
9314
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009315 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009316 PyMem_Free(buf2);
9317
9318 return result;
9319}
9320
9321Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009322_PyUnicode_InsertThousandsGrouping(
9323 PyObject *unicode, Py_ssize_t index,
9324 Py_ssize_t n_buffer,
9325 void *digits, Py_ssize_t n_digits,
9326 Py_ssize_t min_width,
9327 const char *grouping, PyObject *thousands_sep,
9328 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009329{
Victor Stinner41a863c2012-02-24 00:37:51 +01009330 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009331 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01009332 Py_ssize_t thousands_sep_len;
9333 Py_ssize_t len;
9334
9335 if (unicode != NULL) {
9336 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009337 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01009338 }
9339 else {
9340 kind = PyUnicode_1BYTE_KIND;
9341 data = NULL;
9342 }
9343 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
9344 thousands_sep_data = PyUnicode_DATA(thousands_sep);
9345 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9346 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01009347 if (thousands_sep_kind < kind) {
9348 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
9349 if (!thousands_sep_data)
9350 return -1;
9351 }
9352 else {
9353 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
9354 if (!data)
9355 return -1;
9356 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009357 }
9358
Benjamin Petersonead6b532011-12-20 17:23:42 -06009359 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009360 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009361 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01009362 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009363 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009364 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009365 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009366 else
Victor Stinner41a863c2012-02-24 00:37:51 +01009367 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02009368 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009369 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009370 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009371 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009372 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009373 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009374 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009375 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009376 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009377 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009378 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009379 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009380 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009381 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009382 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009383 break;
9384 default:
9385 assert(0);
9386 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009387 }
Victor Stinner90f50d42012-02-24 01:44:47 +01009388 if (unicode != NULL && thousands_sep_kind != kind) {
9389 if (thousands_sep_kind < kind)
9390 PyMem_Free(thousands_sep_data);
9391 else
9392 PyMem_Free(data);
9393 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009394 if (unicode == NULL) {
9395 *maxchar = 127;
9396 if (len != n_digits) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009397 *maxchar = Py_MAX(*maxchar,
Victor Stinnere6abb482012-05-02 01:15:40 +02009398 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
Victor Stinner41a863c2012-02-24 00:37:51 +01009399 }
9400 }
9401 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009402}
9403
9404
Alexander Belopolsky40018472011-02-26 01:02:56 +00009405Py_ssize_t
9406PyUnicode_Count(PyObject *str,
9407 PyObject *substr,
9408 Py_ssize_t start,
9409 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009410{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009411 Py_ssize_t result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009412 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009413 void *buf1 = NULL, *buf2 = NULL;
9414 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009415
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009416 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009417 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009418
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009419 kind1 = PyUnicode_KIND(str);
9420 kind2 = PyUnicode_KIND(substr);
9421 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009422 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009423
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009424 len1 = PyUnicode_GET_LENGTH(str);
9425 len2 = PyUnicode_GET_LENGTH(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009426 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009427 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009428 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009429
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009430 buf1 = PyUnicode_DATA(str);
9431 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009432 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009433 buf2 = _PyUnicode_AsKind(substr, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009434 if (!buf2)
9435 goto onError;
9436 }
9437
9438 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009439 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009440 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
Victor Stinnerc3cec782011-10-05 21:24:08 +02009441 result = asciilib_count(
9442 ((Py_UCS1*)buf1) + start, end - start,
9443 buf2, len2, PY_SSIZE_T_MAX
9444 );
9445 else
9446 result = ucs1lib_count(
9447 ((Py_UCS1*)buf1) + start, end - start,
9448 buf2, len2, PY_SSIZE_T_MAX
9449 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009450 break;
9451 case PyUnicode_2BYTE_KIND:
9452 result = ucs2lib_count(
9453 ((Py_UCS2*)buf1) + start, end - start,
9454 buf2, len2, PY_SSIZE_T_MAX
9455 );
9456 break;
9457 case PyUnicode_4BYTE_KIND:
9458 result = ucs4lib_count(
9459 ((Py_UCS4*)buf1) + start, end - start,
9460 buf2, len2, PY_SSIZE_T_MAX
9461 );
9462 break;
9463 default:
9464 assert(0); result = 0;
9465 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009466
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009467 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009468 PyMem_Free(buf2);
9469
Guido van Rossumd57fd912000-03-10 22:53:23 +00009470 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009471 onError:
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009472 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009473 PyMem_Free(buf2);
9474 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009475}
9476
Alexander Belopolsky40018472011-02-26 01:02:56 +00009477Py_ssize_t
9478PyUnicode_Find(PyObject *str,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009479 PyObject *substr,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009480 Py_ssize_t start,
9481 Py_ssize_t end,
9482 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009483{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009484 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009485 return -2;
Tim Petersced69f82003-09-16 20:30:58 +00009486
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009487 return any_find_slice(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009488}
9489
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009490Py_ssize_t
9491PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9492 Py_ssize_t start, Py_ssize_t end,
9493 int direction)
9494{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009495 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009496 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009497 if (PyUnicode_READY(str) == -1)
9498 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009499 if (start < 0 || end < 0) {
9500 PyErr_SetString(PyExc_IndexError, "string index out of range");
9501 return -2;
9502 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009503 if (end > PyUnicode_GET_LENGTH(str))
9504 end = PyUnicode_GET_LENGTH(str);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009505 if (start >= end)
9506 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009507 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009508 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9509 kind, end-start, ch, direction);
9510 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009511 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009512 else
9513 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009514}
9515
Alexander Belopolsky40018472011-02-26 01:02:56 +00009516static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009517tailmatch(PyObject *self,
9518 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009519 Py_ssize_t start,
9520 Py_ssize_t end,
9521 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009522{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009523 int kind_self;
9524 int kind_sub;
9525 void *data_self;
9526 void *data_sub;
9527 Py_ssize_t offset;
9528 Py_ssize_t i;
9529 Py_ssize_t end_sub;
9530
9531 if (PyUnicode_READY(self) == -1 ||
9532 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009533 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009534
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009535 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9536 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009537 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009538 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009539
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009540 if (PyUnicode_GET_LENGTH(substring) == 0)
9541 return 1;
9542
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009543 kind_self = PyUnicode_KIND(self);
9544 data_self = PyUnicode_DATA(self);
9545 kind_sub = PyUnicode_KIND(substring);
9546 data_sub = PyUnicode_DATA(substring);
9547 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9548
9549 if (direction > 0)
9550 offset = end;
9551 else
9552 offset = start;
9553
9554 if (PyUnicode_READ(kind_self, data_self, offset) ==
9555 PyUnicode_READ(kind_sub, data_sub, 0) &&
9556 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9557 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9558 /* If both are of the same kind, memcmp is sufficient */
9559 if (kind_self == kind_sub) {
9560 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009561 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009562 data_sub,
9563 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009564 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009565 }
Martin Pantere26da7c2016-06-02 10:07:09 +00009566 /* otherwise we have to compare each character by first accessing it */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009567 else {
9568 /* We do not need to compare 0 and len(substring)-1 because
9569 the if statement above ensured already that they are equal
9570 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009571 for (i = 1; i < end_sub; ++i) {
9572 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9573 PyUnicode_READ(kind_sub, data_sub, i))
9574 return 0;
9575 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009576 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009577 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009578 }
9579
9580 return 0;
9581}
9582
Alexander Belopolsky40018472011-02-26 01:02:56 +00009583Py_ssize_t
9584PyUnicode_Tailmatch(PyObject *str,
9585 PyObject *substr,
9586 Py_ssize_t start,
9587 Py_ssize_t end,
9588 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009589{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009590 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009591 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009592
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009593 return tailmatch(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009594}
9595
Guido van Rossumd57fd912000-03-10 22:53:23 +00009596/* Apply fixfct filter to the Unicode object self and return a
9597 reference to the modified object */
9598
Alexander Belopolsky40018472011-02-26 01:02:56 +00009599static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009600fixup(PyObject *self,
9601 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009602{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009603 PyObject *u;
9604 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009605 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009606
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009607 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009608 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009609 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009610 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009611
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009612 /* fix functions return the new maximum character in a string,
9613 if the kind of the resulting unicode object does not change,
9614 everything is fine. Otherwise we need to change the string kind
9615 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009616 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009617
9618 if (maxchar_new == 0) {
9619 /* no changes */;
9620 if (PyUnicode_CheckExact(self)) {
9621 Py_DECREF(u);
9622 Py_INCREF(self);
9623 return self;
9624 }
9625 else
9626 return u;
9627 }
9628
Victor Stinnere6abb482012-05-02 01:15:40 +02009629 maxchar_new = align_maxchar(maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009630
Victor Stinnereaab6042011-12-11 22:22:39 +01009631 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009632 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009633
9634 /* In case the maximum character changed, we need to
9635 convert the string to the new category. */
9636 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9637 if (v == NULL) {
9638 Py_DECREF(u);
9639 return NULL;
9640 }
9641 if (maxchar_new > maxchar_old) {
9642 /* If the maxchar increased so that the kind changed, not all
9643 characters are representable anymore and we need to fix the
9644 string again. This only happens in very few cases. */
Victor Stinnerd3f08822012-05-29 12:57:52 +02009645 _PyUnicode_FastCopyCharacters(v, 0,
9646 self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinnereaab6042011-12-11 22:22:39 +01009647 maxchar_old = fixfct(v);
9648 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009649 }
9650 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009651 _PyUnicode_FastCopyCharacters(v, 0,
9652 u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009653 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009654 Py_DECREF(u);
9655 assert(_PyUnicode_CheckConsistency(v, 1));
9656 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009657}
9658
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009659static PyObject *
9660ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009661{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009662 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9663 char *resdata, *data = PyUnicode_DATA(self);
9664 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009665
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009666 res = PyUnicode_New(len, 127);
9667 if (res == NULL)
9668 return NULL;
9669 resdata = PyUnicode_DATA(res);
9670 if (lower)
9671 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009672 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009673 _Py_bytes_upper(resdata, data, len);
9674 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009675}
9676
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009677static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009678handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009679{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009680 Py_ssize_t j;
9681 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01009682 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009683 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009684
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009685 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9686
9687 where ! is a negation and \p{xxx} is a character with property xxx.
9688 */
9689 for (j = i - 1; j >= 0; j--) {
9690 c = PyUnicode_READ(kind, data, j);
9691 if (!_PyUnicode_IsCaseIgnorable(c))
9692 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009693 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009694 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9695 if (final_sigma) {
9696 for (j = i + 1; j < length; j++) {
9697 c = PyUnicode_READ(kind, data, j);
9698 if (!_PyUnicode_IsCaseIgnorable(c))
9699 break;
9700 }
9701 final_sigma = j == length || !_PyUnicode_IsCased(c);
9702 }
9703 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009704}
9705
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009706static int
9707lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9708 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009709{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009710 /* Obscure special case. */
9711 if (c == 0x3A3) {
9712 mapped[0] = handle_capital_sigma(kind, data, length, i);
9713 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009714 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009715 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009716}
9717
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009718static Py_ssize_t
9719do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009720{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009721 Py_ssize_t i, k = 0;
9722 int n_res, j;
9723 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009724
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009725 c = PyUnicode_READ(kind, data, 0);
9726 n_res = _PyUnicode_ToUpperFull(c, mapped);
9727 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009728 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009729 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009730 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009731 for (i = 1; i < length; i++) {
9732 c = PyUnicode_READ(kind, data, i);
9733 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9734 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009735 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009736 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009737 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009738 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009739 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009740}
9741
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009742static Py_ssize_t
9743do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9744 Py_ssize_t i, k = 0;
9745
9746 for (i = 0; i < length; i++) {
9747 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9748 int n_res, j;
9749 if (Py_UNICODE_ISUPPER(c)) {
9750 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9751 }
9752 else if (Py_UNICODE_ISLOWER(c)) {
9753 n_res = _PyUnicode_ToUpperFull(c, mapped);
9754 }
9755 else {
9756 n_res = 1;
9757 mapped[0] = c;
9758 }
9759 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009760 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009761 res[k++] = mapped[j];
9762 }
9763 }
9764 return k;
9765}
9766
9767static Py_ssize_t
9768do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9769 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009770{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009771 Py_ssize_t i, k = 0;
9772
9773 for (i = 0; i < length; i++) {
9774 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9775 int n_res, j;
9776 if (lower)
9777 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9778 else
9779 n_res = _PyUnicode_ToUpperFull(c, mapped);
9780 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009781 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009782 res[k++] = mapped[j];
9783 }
9784 }
9785 return k;
9786}
9787
9788static Py_ssize_t
9789do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9790{
9791 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9792}
9793
9794static Py_ssize_t
9795do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9796{
9797 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9798}
9799
Benjamin Petersone51757f2012-01-12 21:10:29 -05009800static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009801do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9802{
9803 Py_ssize_t i, k = 0;
9804
9805 for (i = 0; i < length; i++) {
9806 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9807 Py_UCS4 mapped[3];
9808 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9809 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009810 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009811 res[k++] = mapped[j];
9812 }
9813 }
9814 return k;
9815}
9816
9817static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009818do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9819{
9820 Py_ssize_t i, k = 0;
9821 int previous_is_cased;
9822
9823 previous_is_cased = 0;
9824 for (i = 0; i < length; i++) {
9825 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9826 Py_UCS4 mapped[3];
9827 int n_res, j;
9828
9829 if (previous_is_cased)
9830 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9831 else
9832 n_res = _PyUnicode_ToTitleFull(c, mapped);
9833
9834 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009835 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009836 res[k++] = mapped[j];
9837 }
9838
9839 previous_is_cased = _PyUnicode_IsCased(c);
9840 }
9841 return k;
9842}
9843
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009844static PyObject *
9845case_operation(PyObject *self,
9846 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9847{
9848 PyObject *res = NULL;
9849 Py_ssize_t length, newlength = 0;
9850 int kind, outkind;
9851 void *data, *outdata;
9852 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9853
Benjamin Petersoneea48462012-01-16 14:28:50 -05009854 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009855
9856 kind = PyUnicode_KIND(self);
9857 data = PyUnicode_DATA(self);
9858 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +02009859 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -04009860 PyErr_SetString(PyExc_OverflowError, "string is too long");
9861 return NULL;
9862 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -04009863 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009864 if (tmp == NULL)
9865 return PyErr_NoMemory();
9866 newlength = perform(kind, data, length, tmp, &maxchar);
9867 res = PyUnicode_New(newlength, maxchar);
9868 if (res == NULL)
9869 goto leave;
9870 tmpend = tmp + newlength;
9871 outdata = PyUnicode_DATA(res);
9872 outkind = PyUnicode_KIND(res);
9873 switch (outkind) {
9874 case PyUnicode_1BYTE_KIND:
9875 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9876 break;
9877 case PyUnicode_2BYTE_KIND:
9878 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9879 break;
9880 case PyUnicode_4BYTE_KIND:
9881 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9882 break;
9883 default:
9884 assert(0);
9885 break;
9886 }
9887 leave:
9888 PyMem_FREE(tmp);
9889 return res;
9890}
9891
Tim Peters8ce9f162004-08-27 01:49:32 +00009892PyObject *
9893PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009894{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009895 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009896 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009897 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009898 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009899 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9900 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009901 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009902 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009903 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009904 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009905 int use_memcpy;
9906 unsigned char *res_data = NULL, *sep_data = NULL;
9907 PyObject *last_obj;
9908 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009909
Benjamin Peterson9743b2c2014-02-15 13:02:52 -05009910 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +00009911 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009912 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009913 }
9914
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009915 /* NOTE: the following code can't call back into Python code,
9916 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009917 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009918
Tim Peters05eba1f2004-08-27 21:32:02 +00009919 seqlen = PySequence_Fast_GET_SIZE(fseq);
9920 /* If empty sequence, return u"". */
9921 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009922 Py_DECREF(fseq);
Serhiy Storchaka678db842013-01-26 12:16:36 +02009923 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +00009924 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009925
Tim Peters05eba1f2004-08-27 21:32:02 +00009926 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009927 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009928 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009929 if (seqlen == 1) {
9930 if (PyUnicode_CheckExact(items[0])) {
9931 res = items[0];
9932 Py_INCREF(res);
9933 Py_DECREF(fseq);
9934 return res;
9935 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009936 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009937 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009938 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009939 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009940 /* Set up sep and seplen */
9941 if (separator == NULL) {
9942 /* fall back to a blank space separator */
9943 sep = PyUnicode_FromOrdinal(' ');
9944 if (!sep)
9945 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009946 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009947 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009948 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009949 else {
9950 if (!PyUnicode_Check(separator)) {
9951 PyErr_Format(PyExc_TypeError,
9952 "separator: expected str instance,"
9953 " %.80s found",
9954 Py_TYPE(separator)->tp_name);
9955 goto onError;
9956 }
9957 if (PyUnicode_READY(separator))
9958 goto onError;
9959 sep = separator;
9960 seplen = PyUnicode_GET_LENGTH(separator);
9961 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9962 /* inc refcount to keep this code path symmetric with the
9963 above case of a blank separator */
9964 Py_INCREF(sep);
9965 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009966 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009967 }
9968
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009969 /* There are at least two things to join, or else we have a subclass
9970 * of str in the sequence.
9971 * Do a pre-pass to figure out the total amount of space we'll
9972 * need (sz), and see whether all argument are strings.
9973 */
9974 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009975#ifdef Py_DEBUG
9976 use_memcpy = 0;
9977#else
9978 use_memcpy = 1;
9979#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009980 for (i = 0; i < seqlen; i++) {
9981 const Py_ssize_t old_sz = sz;
9982 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009983 if (!PyUnicode_Check(item)) {
9984 PyErr_Format(PyExc_TypeError,
Victor Stinnera33bce02014-07-04 22:47:46 +02009985 "sequence item %zd: expected str instance,"
Benjamin Peterson29060642009-01-31 22:14:21 +00009986 " %.80s found",
9987 i, Py_TYPE(item)->tp_name);
9988 goto onError;
9989 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009990 if (PyUnicode_READY(item) == -1)
9991 goto onError;
9992 sz += PyUnicode_GET_LENGTH(item);
9993 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -07009994 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009995 if (i != 0)
9996 sz += seplen;
9997 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9998 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009999 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010000 goto onError;
10001 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010002 if (use_memcpy && last_obj != NULL) {
10003 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10004 use_memcpy = 0;
10005 }
10006 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010007 }
Tim Petersced69f82003-09-16 20:30:58 +000010008
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010009 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010010 if (res == NULL)
10011 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +000010012
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010013 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010014#ifdef Py_DEBUG
10015 use_memcpy = 0;
10016#else
10017 if (use_memcpy) {
10018 res_data = PyUnicode_1BYTE_DATA(res);
10019 kind = PyUnicode_KIND(res);
10020 if (seplen != 0)
10021 sep_data = PyUnicode_1BYTE_DATA(sep);
10022 }
10023#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +020010024 if (use_memcpy) {
10025 for (i = 0; i < seqlen; ++i) {
10026 Py_ssize_t itemlen;
10027 item = items[i];
10028
10029 /* Copy item, and maybe the separator. */
10030 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +020010031 Py_MEMCPY(res_data,
10032 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010033 kind * seplen);
10034 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010035 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010036
10037 itemlen = PyUnicode_GET_LENGTH(item);
10038 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +020010039 Py_MEMCPY(res_data,
10040 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010041 kind * itemlen);
10042 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010043 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010044 }
10045 assert(res_data == PyUnicode_1BYTE_DATA(res)
10046 + kind * PyUnicode_GET_LENGTH(res));
10047 }
10048 else {
10049 for (i = 0, res_offset = 0; i < seqlen; ++i) {
10050 Py_ssize_t itemlen;
10051 item = items[i];
10052
10053 /* Copy item, and maybe the separator. */
10054 if (i && seplen != 0) {
10055 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10056 res_offset += seplen;
10057 }
10058
10059 itemlen = PyUnicode_GET_LENGTH(item);
10060 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020010061 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +020010062 res_offset += itemlen;
10063 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010064 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010065 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +020010066 }
Tim Peters8ce9f162004-08-27 01:49:32 +000010067
Tim Peters05eba1f2004-08-27 21:32:02 +000010068 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010069 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010070 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010071 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010072
Benjamin Peterson29060642009-01-31 22:14:21 +000010073 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +000010074 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010075 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010076 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010077 return NULL;
10078}
10079
/* FILL(kind, data, value, start, length): store `value` into `length`
   consecutive code units of the buffer `data`, beginning at code-unit
   index `start`.  `kind` selects the code-unit width (1, 2 or 4 bytes).

   This is a statement macro; `kind` and `value` may be evaluated more
   than once, so pass side-effect-free expressions.  Every parameter is
   parenthesized so that expression arguments expand as intended. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
10103
Victor Stinnerd3f08822012-05-29 12:57:52 +020010104void
10105_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10106 Py_UCS4 fill_char)
10107{
10108 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
10109 const void *data = PyUnicode_DATA(unicode);
10110 assert(PyUnicode_IS_READY(unicode));
10111 assert(unicode_modifiable(unicode));
10112 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10113 assert(start >= 0);
10114 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
10115 FILL(kind, data, fill_char, start, length);
10116}
10117
Victor Stinner3fe55312012-01-04 00:33:50 +010010118Py_ssize_t
10119PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10120 Py_UCS4 fill_char)
10121{
10122 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +010010123
10124 if (!PyUnicode_Check(unicode)) {
10125 PyErr_BadInternalCall();
10126 return -1;
10127 }
10128 if (PyUnicode_READY(unicode) == -1)
10129 return -1;
10130 if (unicode_check_modifiable(unicode))
10131 return -1;
10132
Victor Stinnerd3f08822012-05-29 12:57:52 +020010133 if (start < 0) {
10134 PyErr_SetString(PyExc_IndexError, "string index out of range");
10135 return -1;
10136 }
Victor Stinner3fe55312012-01-04 00:33:50 +010010137 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10138 PyErr_SetString(PyExc_ValueError,
10139 "fill character is bigger than "
10140 "the string maximum character");
10141 return -1;
10142 }
10143
10144 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10145 length = Py_MIN(maxlen, length);
10146 if (length <= 0)
10147 return 0;
10148
Victor Stinnerd3f08822012-05-29 12:57:52 +020010149 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +010010150 return length;
10151}
10152
Victor Stinner9310abb2011-10-05 00:59:23 +020010153static PyObject *
10154pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010155 Py_ssize_t left,
10156 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010157 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010158{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010159 PyObject *u;
10160 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010161 int kind;
10162 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010163
10164 if (left < 0)
10165 left = 0;
10166 if (right < 0)
10167 right = 0;
10168
Victor Stinnerc4b49542011-12-11 22:44:26 +010010169 if (left == 0 && right == 0)
10170 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010171
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010172 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10173 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010174 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10175 return NULL;
10176 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010177 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010178 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010179 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010180 if (!u)
10181 return NULL;
10182
10183 kind = PyUnicode_KIND(u);
10184 data = PyUnicode_DATA(u);
10185 if (left)
10186 FILL(kind, data, fill, 0, left);
10187 if (right)
10188 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010189 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010190 assert(_PyUnicode_CheckConsistency(u, 1));
10191 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010192}
10193
Alexander Belopolsky40018472011-02-26 01:02:56 +000010194PyObject *
10195PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010196{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010197 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010198
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010199 if (ensure_unicode(string) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010200 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010201
Benjamin Petersonead6b532011-12-20 17:23:42 -060010202 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010203 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010204 if (PyUnicode_IS_ASCII(string))
10205 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010206 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010207 PyUnicode_GET_LENGTH(string), keepends);
10208 else
10209 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010210 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010211 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010212 break;
10213 case PyUnicode_2BYTE_KIND:
10214 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010215 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010216 PyUnicode_GET_LENGTH(string), keepends);
10217 break;
10218 case PyUnicode_4BYTE_KIND:
10219 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010220 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010221 PyUnicode_GET_LENGTH(string), keepends);
10222 break;
10223 default:
10224 assert(0);
10225 list = 0;
10226 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010227 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010228}
10229
Alexander Belopolsky40018472011-02-26 01:02:56 +000010230static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010231split(PyObject *self,
10232 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010233 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010234{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010235 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010236 void *buf1, *buf2;
10237 Py_ssize_t len1, len2;
10238 PyObject* out;
10239
Guido van Rossumd57fd912000-03-10 22:53:23 +000010240 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010241 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010242
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010243 if (PyUnicode_READY(self) == -1)
10244 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010245
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010246 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010247 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010248 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010249 if (PyUnicode_IS_ASCII(self))
10250 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010251 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010252 PyUnicode_GET_LENGTH(self), maxcount
10253 );
10254 else
10255 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010256 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010257 PyUnicode_GET_LENGTH(self), maxcount
10258 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010259 case PyUnicode_2BYTE_KIND:
10260 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010261 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010262 PyUnicode_GET_LENGTH(self), maxcount
10263 );
10264 case PyUnicode_4BYTE_KIND:
10265 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010266 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010267 PyUnicode_GET_LENGTH(self), maxcount
10268 );
10269 default:
10270 assert(0);
10271 return NULL;
10272 }
10273
10274 if (PyUnicode_READY(substring) == -1)
10275 return NULL;
10276
10277 kind1 = PyUnicode_KIND(self);
10278 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010279 len1 = PyUnicode_GET_LENGTH(self);
10280 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010281 if (kind1 < kind2 || len1 < len2) {
10282 out = PyList_New(1);
10283 if (out == NULL)
10284 return NULL;
10285 Py_INCREF(self);
10286 PyList_SET_ITEM(out, 0, self);
10287 return out;
10288 }
10289 buf1 = PyUnicode_DATA(self);
10290 buf2 = PyUnicode_DATA(substring);
10291 if (kind2 != kind1) {
10292 buf2 = _PyUnicode_AsKind(substring, kind1);
10293 if (!buf2)
10294 return NULL;
10295 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010296
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010297 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010298 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010299 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10300 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010301 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010302 else
10303 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010304 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010305 break;
10306 case PyUnicode_2BYTE_KIND:
10307 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010308 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010309 break;
10310 case PyUnicode_4BYTE_KIND:
10311 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010312 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010313 break;
10314 default:
10315 out = NULL;
10316 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010317 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010318 PyMem_Free(buf2);
10319 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010320}
10321
Alexander Belopolsky40018472011-02-26 01:02:56 +000010322static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010323rsplit(PyObject *self,
10324 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010325 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010326{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010327 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010328 void *buf1, *buf2;
10329 Py_ssize_t len1, len2;
10330 PyObject* out;
10331
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010332 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010333 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010334
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010335 if (PyUnicode_READY(self) == -1)
10336 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010337
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010338 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010339 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010340 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010341 if (PyUnicode_IS_ASCII(self))
10342 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010343 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010344 PyUnicode_GET_LENGTH(self), maxcount
10345 );
10346 else
10347 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010348 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010349 PyUnicode_GET_LENGTH(self), maxcount
10350 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010351 case PyUnicode_2BYTE_KIND:
10352 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010353 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010354 PyUnicode_GET_LENGTH(self), maxcount
10355 );
10356 case PyUnicode_4BYTE_KIND:
10357 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010358 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010359 PyUnicode_GET_LENGTH(self), maxcount
10360 );
10361 default:
10362 assert(0);
10363 return NULL;
10364 }
10365
10366 if (PyUnicode_READY(substring) == -1)
10367 return NULL;
10368
10369 kind1 = PyUnicode_KIND(self);
10370 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010371 len1 = PyUnicode_GET_LENGTH(self);
10372 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010373 if (kind1 < kind2 || len1 < len2) {
10374 out = PyList_New(1);
10375 if (out == NULL)
10376 return NULL;
10377 Py_INCREF(self);
10378 PyList_SET_ITEM(out, 0, self);
10379 return out;
10380 }
10381 buf1 = PyUnicode_DATA(self);
10382 buf2 = PyUnicode_DATA(substring);
10383 if (kind2 != kind1) {
10384 buf2 = _PyUnicode_AsKind(substring, kind1);
10385 if (!buf2)
10386 return NULL;
10387 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010388
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010389 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010390 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010391 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10392 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010393 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010394 else
10395 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010396 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010397 break;
10398 case PyUnicode_2BYTE_KIND:
10399 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010400 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010401 break;
10402 case PyUnicode_4BYTE_KIND:
10403 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010404 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010405 break;
10406 default:
10407 out = NULL;
10408 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010409 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010410 PyMem_Free(buf2);
10411 return out;
10412}
10413
10414static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010415anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10416 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010417{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010418 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010419 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010420 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10421 return asciilib_find(buf1, len1, buf2, len2, offset);
10422 else
10423 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010424 case PyUnicode_2BYTE_KIND:
10425 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10426 case PyUnicode_4BYTE_KIND:
10427 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10428 }
10429 assert(0);
10430 return -1;
10431}
10432
10433static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010434anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10435 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010436{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010437 switch (kind) {
10438 case PyUnicode_1BYTE_KIND:
10439 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10440 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10441 else
10442 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10443 case PyUnicode_2BYTE_KIND:
10444 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10445 case PyUnicode_4BYTE_KIND:
10446 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10447 }
10448 assert(0);
10449 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010450}
10451
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010452static void
10453replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10454 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10455{
10456 int kind = PyUnicode_KIND(u);
10457 void *data = PyUnicode_DATA(u);
10458 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10459 if (kind == PyUnicode_1BYTE_KIND) {
10460 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10461 (Py_UCS1 *)data + len,
10462 u1, u2, maxcount);
10463 }
10464 else if (kind == PyUnicode_2BYTE_KIND) {
10465 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10466 (Py_UCS2 *)data + len,
10467 u1, u2, maxcount);
10468 }
10469 else {
10470 assert(kind == PyUnicode_4BYTE_KIND);
10471 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10472 (Py_UCS4 *)data + len,
10473 u1, u2, maxcount);
10474 }
10475}
10476
Alexander Belopolsky40018472011-02-26 01:02:56 +000010477static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010478replace(PyObject *self, PyObject *str1,
10479 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010480{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010481 PyObject *u;
10482 char *sbuf = PyUnicode_DATA(self);
10483 char *buf1 = PyUnicode_DATA(str1);
10484 char *buf2 = PyUnicode_DATA(str2);
10485 int srelease = 0, release1 = 0, release2 = 0;
10486 int skind = PyUnicode_KIND(self);
10487 int kind1 = PyUnicode_KIND(str1);
10488 int kind2 = PyUnicode_KIND(str2);
10489 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10490 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10491 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010492 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010493 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010494
10495 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010496 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010497 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010498 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010499
Victor Stinner59de0ee2011-10-07 10:01:28 +020010500 if (str1 == str2)
10501 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010502
Victor Stinner49a0a212011-10-12 23:46:10 +020010503 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010504 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10505 if (maxchar < maxchar_str1)
10506 /* substring too wide to be present */
10507 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010508 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10509 /* Replacing str1 with str2 may cause a maxchar reduction in the
10510 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010511 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010512 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010513
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010514 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010515 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010516 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010517 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010518 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010519 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010520 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010521 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010522
Victor Stinner69ed0f42013-04-09 21:48:24 +020010523 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010524 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010525 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010526 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010527 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010528 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010529 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010530 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010531
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010532 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10533 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010534 }
10535 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010536 int rkind = skind;
10537 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010538 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010539
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010540 if (kind1 < rkind) {
10541 /* widen substring */
10542 buf1 = _PyUnicode_AsKind(str1, rkind);
10543 if (!buf1) goto error;
10544 release1 = 1;
10545 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010546 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010547 if (i < 0)
10548 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010549 if (rkind > kind2) {
10550 /* widen replacement */
10551 buf2 = _PyUnicode_AsKind(str2, rkind);
10552 if (!buf2) goto error;
10553 release2 = 1;
10554 }
10555 else if (rkind < kind2) {
10556 /* widen self and buf1 */
10557 rkind = kind2;
10558 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010559 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010560 sbuf = _PyUnicode_AsKind(self, rkind);
10561 if (!sbuf) goto error;
10562 srelease = 1;
10563 buf1 = _PyUnicode_AsKind(str1, rkind);
10564 if (!buf1) goto error;
10565 release1 = 1;
10566 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010567 u = PyUnicode_New(slen, maxchar);
10568 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010569 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010570 assert(PyUnicode_KIND(u) == rkind);
10571 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010572
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010573 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010574 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010575 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010576 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010577 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010578 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010579
10580 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010581 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010582 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010583 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010584 if (i == -1)
10585 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010586 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010587 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010588 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010589 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010590 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010591 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010592 }
10593 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010594 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010595 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010596 int rkind = skind;
10597 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010598
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010599 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010600 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010601 buf1 = _PyUnicode_AsKind(str1, rkind);
10602 if (!buf1) goto error;
10603 release1 = 1;
10604 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010605 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010606 if (n == 0)
10607 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010608 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010609 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010610 buf2 = _PyUnicode_AsKind(str2, rkind);
10611 if (!buf2) goto error;
10612 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010613 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010614 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010615 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010616 rkind = kind2;
10617 sbuf = _PyUnicode_AsKind(self, rkind);
10618 if (!sbuf) goto error;
10619 srelease = 1;
10620 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010621 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010622 buf1 = _PyUnicode_AsKind(str1, rkind);
10623 if (!buf1) goto error;
10624 release1 = 1;
10625 }
10626 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10627 PyUnicode_GET_LENGTH(str1))); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010628 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010629 PyErr_SetString(PyExc_OverflowError,
10630 "replace string is too long");
10631 goto error;
10632 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010633 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010634 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010635 _Py_INCREF_UNICODE_EMPTY();
10636 if (!unicode_empty)
10637 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010638 u = unicode_empty;
10639 goto done;
10640 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010641 if (new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010642 PyErr_SetString(PyExc_OverflowError,
10643 "replace string is too long");
10644 goto error;
10645 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010646 u = PyUnicode_New(new_size, maxchar);
10647 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010648 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010649 assert(PyUnicode_KIND(u) == rkind);
10650 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010651 ires = i = 0;
10652 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010653 while (n-- > 0) {
10654 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010655 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010656 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010657 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010658 if (j == -1)
10659 break;
10660 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010661 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010662 memcpy(res + rkind * ires,
10663 sbuf + rkind * i,
10664 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010665 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010666 }
10667 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010668 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010669 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010670 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010671 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010672 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010673 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010674 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010675 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010676 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010677 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010678 memcpy(res + rkind * ires,
10679 sbuf + rkind * i,
10680 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010681 }
10682 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010683 /* interleave */
10684 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010685 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010686 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010687 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010688 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010689 if (--n <= 0)
10690 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010691 memcpy(res + rkind * ires,
10692 sbuf + rkind * i,
10693 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010694 ires++;
10695 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010696 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010697 memcpy(res + rkind * ires,
10698 sbuf + rkind * i,
10699 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010700 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010701 }
10702
10703 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010704 unicode_adjust_maxchar(&u);
10705 if (u == NULL)
10706 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010707 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010708
10709 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010710 if (srelease)
10711 PyMem_FREE(sbuf);
10712 if (release1)
10713 PyMem_FREE(buf1);
10714 if (release2)
10715 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010716 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010717 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010718
Benjamin Peterson29060642009-01-31 22:14:21 +000010719 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010720 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010721 if (srelease)
10722 PyMem_FREE(sbuf);
10723 if (release1)
10724 PyMem_FREE(buf1);
10725 if (release2)
10726 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010727 return unicode_result_unchanged(self);
10728
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010729 error:
10730 if (srelease && sbuf)
10731 PyMem_FREE(sbuf);
10732 if (release1 && buf1)
10733 PyMem_FREE(buf1);
10734 if (release2 && buf2)
10735 PyMem_FREE(buf2);
10736 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010737}
10738
10739/* --- Unicode Object Methods --------------------------------------------- */
10740
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010741PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010742 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010743\n\
10744Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010745characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010746
10747static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010748unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010749{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010750 if (PyUnicode_READY(self) == -1)
10751 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010752 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010753}
10754
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010755PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010756 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010757\n\
10758Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010759have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010760
10761static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010762unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010763{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010764 if (PyUnicode_READY(self) == -1)
10765 return NULL;
10766 if (PyUnicode_GET_LENGTH(self) == 0)
10767 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010768 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010769}
10770
Benjamin Petersond5890c82012-01-14 13:23:30 -050010771PyDoc_STRVAR(casefold__doc__,
10772 "S.casefold() -> str\n\
10773\n\
10774Return a version of S suitable for caseless comparisons.");
10775
10776static PyObject *
10777unicode_casefold(PyObject *self)
10778{
10779 if (PyUnicode_READY(self) == -1)
10780 return NULL;
10781 if (PyUnicode_IS_ASCII(self))
10782 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010783 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010784}
10785
10786
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010787/* Argument converter. Accepts a single Unicode character. */
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010788
10789static int
10790convert_uc(PyObject *obj, void *addr)
10791{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010792 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010793
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010794 if (!PyUnicode_Check(obj)) {
10795 PyErr_Format(PyExc_TypeError,
10796 "The fill character must be a unicode character, "
10797 "not %.100s", Py_TYPE(obj)->tp_name);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010798 return 0;
10799 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010800 if (PyUnicode_READY(obj) < 0)
10801 return 0;
10802 if (PyUnicode_GET_LENGTH(obj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010803 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010804 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010805 return 0;
10806 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010807 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010808 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010809}
10810
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010811PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010812 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010813\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010814Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010815done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010816
10817static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010818unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010819{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010820 Py_ssize_t marg, left;
10821 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010822 Py_UCS4 fillchar = ' ';
10823
Victor Stinnere9a29352011-10-01 02:14:59 +020010824 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010825 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010826
Benjamin Petersonbac79492012-01-14 13:34:47 -050010827 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010828 return NULL;
10829
Victor Stinnerc4b49542011-12-11 22:44:26 +010010830 if (PyUnicode_GET_LENGTH(self) >= width)
10831 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010832
Victor Stinnerc4b49542011-12-11 22:44:26 +010010833 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010834 left = marg / 2 + (marg & width & 1);
10835
Victor Stinner9310abb2011-10-05 00:59:23 +020010836 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010837}
10838
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010839/* This function assumes that str1 and str2 are readied by the caller. */
10840
Marc-André Lemburge5034372000-08-08 08:04:29 +000010841static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010842unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010843{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010844#define COMPARE(TYPE1, TYPE2) \
10845 do { \
10846 TYPE1* p1 = (TYPE1 *)data1; \
10847 TYPE2* p2 = (TYPE2 *)data2; \
10848 TYPE1* end = p1 + len; \
10849 Py_UCS4 c1, c2; \
10850 for (; p1 != end; p1++, p2++) { \
10851 c1 = *p1; \
10852 c2 = *p2; \
10853 if (c1 != c2) \
10854 return (c1 < c2) ? -1 : 1; \
10855 } \
10856 } \
10857 while (0)
10858
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010859 int kind1, kind2;
10860 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010861 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010862
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010863 kind1 = PyUnicode_KIND(str1);
10864 kind2 = PyUnicode_KIND(str2);
10865 data1 = PyUnicode_DATA(str1);
10866 data2 = PyUnicode_DATA(str2);
10867 len1 = PyUnicode_GET_LENGTH(str1);
10868 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010869 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010870
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010871 switch(kind1) {
10872 case PyUnicode_1BYTE_KIND:
10873 {
10874 switch(kind2) {
10875 case PyUnicode_1BYTE_KIND:
10876 {
10877 int cmp = memcmp(data1, data2, len);
10878 /* normalize result of memcmp() into the range [-1; 1] */
10879 if (cmp < 0)
10880 return -1;
10881 if (cmp > 0)
10882 return 1;
10883 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020010884 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010885 case PyUnicode_2BYTE_KIND:
10886 COMPARE(Py_UCS1, Py_UCS2);
10887 break;
10888 case PyUnicode_4BYTE_KIND:
10889 COMPARE(Py_UCS1, Py_UCS4);
10890 break;
10891 default:
10892 assert(0);
10893 }
10894 break;
10895 }
10896 case PyUnicode_2BYTE_KIND:
10897 {
10898 switch(kind2) {
10899 case PyUnicode_1BYTE_KIND:
10900 COMPARE(Py_UCS2, Py_UCS1);
10901 break;
10902 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010903 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010904 COMPARE(Py_UCS2, Py_UCS2);
10905 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010906 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010907 case PyUnicode_4BYTE_KIND:
10908 COMPARE(Py_UCS2, Py_UCS4);
10909 break;
10910 default:
10911 assert(0);
10912 }
10913 break;
10914 }
10915 case PyUnicode_4BYTE_KIND:
10916 {
10917 switch(kind2) {
10918 case PyUnicode_1BYTE_KIND:
10919 COMPARE(Py_UCS4, Py_UCS1);
10920 break;
10921 case PyUnicode_2BYTE_KIND:
10922 COMPARE(Py_UCS4, Py_UCS2);
10923 break;
10924 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010925 {
10926#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10927 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10928 /* normalize result of wmemcmp() into the range [-1; 1] */
10929 if (cmp < 0)
10930 return -1;
10931 if (cmp > 0)
10932 return 1;
10933#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010934 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020010935#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010936 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010937 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010938 default:
10939 assert(0);
10940 }
10941 break;
10942 }
10943 default:
10944 assert(0);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010945 }
10946
Victor Stinner770e19e2012-10-04 22:59:45 +020010947 if (len1 == len2)
10948 return 0;
10949 if (len1 < len2)
10950 return -1;
10951 else
10952 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010953
10954#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000010955}
10956
Victor Stinnerc8bc5372013-11-04 11:08:10 +010010957Py_LOCAL(int)
Victor Stinnere5567ad2012-10-23 02:48:49 +020010958unicode_compare_eq(PyObject *str1, PyObject *str2)
10959{
10960 int kind;
10961 void *data1, *data2;
10962 Py_ssize_t len;
10963 int cmp;
10964
Victor Stinnere5567ad2012-10-23 02:48:49 +020010965 len = PyUnicode_GET_LENGTH(str1);
10966 if (PyUnicode_GET_LENGTH(str2) != len)
10967 return 0;
10968 kind = PyUnicode_KIND(str1);
10969 if (PyUnicode_KIND(str2) != kind)
10970 return 0;
10971 data1 = PyUnicode_DATA(str1);
10972 data2 = PyUnicode_DATA(str2);
10973
10974 cmp = memcmp(data1, data2, len * kind);
10975 return (cmp == 0);
10976}
10977
10978
Alexander Belopolsky40018472011-02-26 01:02:56 +000010979int
10980PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010981{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010982 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10983 if (PyUnicode_READY(left) == -1 ||
10984 PyUnicode_READY(right) == -1)
10985 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010010986
10987 /* a string is equal to itself */
10988 if (left == right)
10989 return 0;
10990
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010991 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010992 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010993 PyErr_Format(PyExc_TypeError,
10994 "Can't compare %.100s and %.100s",
10995 left->ob_type->tp_name,
10996 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010997 return -1;
10998}
10999
Martin v. Löwis5b222132007-06-10 09:51:05 +000011000int
Victor Stinnerad14ccd2013-11-07 00:46:04 +010011001_PyUnicode_CompareWithId(PyObject *left, _Py_Identifier *right)
11002{
11003 PyObject *right_str = _PyUnicode_FromId(right); /* borrowed */
11004 if (right_str == NULL)
11005 return -1;
11006 return PyUnicode_Compare(left, right_str);
11007}
11008
11009int
Martin v. Löwis5b222132007-06-10 09:51:05 +000011010PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11011{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011012 Py_ssize_t i;
11013 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011014 Py_UCS4 chr;
11015
Victor Stinner910337b2011-10-03 03:20:16 +020011016 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011017 if (PyUnicode_READY(uni) == -1)
11018 return -1;
11019 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011020 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010011021 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010011022 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011023 size_t len, len2 = strlen(str);
11024 int cmp;
11025
11026 len = Py_MIN(len1, len2);
11027 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010011028 if (cmp != 0) {
11029 if (cmp < 0)
11030 return -1;
11031 else
11032 return 1;
11033 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010011034 if (len1 > len2)
11035 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011036 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010011037 return -1; /* str is longer */
11038 return 0;
11039 }
11040 else {
11041 void *data = PyUnicode_DATA(uni);
11042 /* Compare Unicode string and source character set string */
11043 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020011044 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010011045 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11046 /* This check keeps Python strings that end in '\0' from comparing equal
11047 to C strings identical up to that point. */
11048 if (PyUnicode_GET_LENGTH(uni) != i || chr)
11049 return 1; /* uni is longer */
11050 if (str[i])
11051 return -1; /* str is longer */
11052 return 0;
11053 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000011054}
11055
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011056
Benjamin Peterson29060642009-01-31 22:14:21 +000011057#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000011058 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011059
Alexander Belopolsky40018472011-02-26 01:02:56 +000011060PyObject *
11061PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011062{
11063 int result;
Victor Stinnere5567ad2012-10-23 02:48:49 +020011064 PyObject *v;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011065
Victor Stinnere5567ad2012-10-23 02:48:49 +020011066 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11067 Py_RETURN_NOTIMPLEMENTED;
11068
11069 if (PyUnicode_READY(left) == -1 ||
11070 PyUnicode_READY(right) == -1)
11071 return NULL;
11072
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011073 if (left == right) {
11074 switch (op) {
11075 case Py_EQ:
11076 case Py_LE:
11077 case Py_GE:
11078 /* a string is equal to itself */
11079 v = Py_True;
11080 break;
11081 case Py_NE:
11082 case Py_LT:
11083 case Py_GT:
11084 v = Py_False;
11085 break;
11086 default:
11087 PyErr_BadArgument();
11088 return NULL;
11089 }
11090 }
11091 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020011092 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010011093 result ^= (op == Py_NE);
11094 v = TEST_COND(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020011095 }
11096 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020011097 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011098
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011099 /* Convert the return value to a Boolean */
11100 switch (op) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011101 case Py_LE:
11102 v = TEST_COND(result <= 0);
11103 break;
11104 case Py_GE:
11105 v = TEST_COND(result >= 0);
11106 break;
11107 case Py_LT:
11108 v = TEST_COND(result == -1);
11109 break;
11110 case Py_GT:
11111 v = TEST_COND(result == 1);
11112 break;
11113 default:
11114 PyErr_BadArgument();
11115 return NULL;
11116 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011117 }
Victor Stinnere5567ad2012-10-23 02:48:49 +020011118 Py_INCREF(v);
11119 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011120}
11121
Alexander Belopolsky40018472011-02-26 01:02:56 +000011122int
Raymond Hettingerac2ef652015-07-04 16:04:44 -070011123_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11124{
11125 return unicode_eq(aa, bb);
11126}
11127
11128int
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011129PyUnicode_Contains(PyObject *str, PyObject *substr)
Guido van Rossum403d68b2000-03-13 15:55:09 +000011130{
Victor Stinner77282cb2013-04-14 19:22:47 +020011131 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011132 void *buf1, *buf2;
11133 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011134 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011135
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011136 if (!PyUnicode_Check(substr)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011137 PyErr_Format(PyExc_TypeError,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011138 "'in <string>' requires string as left operand, not %.100s",
11139 Py_TYPE(substr)->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011140 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011141 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011142 if (PyUnicode_READY(substr) == -1)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011143 return -1;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011144 if (ensure_unicode(str) < 0)
11145 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011146
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011147 kind1 = PyUnicode_KIND(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011148 kind2 = PyUnicode_KIND(substr);
11149 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011150 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011151 len1 = PyUnicode_GET_LENGTH(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011152 len2 = PyUnicode_GET_LENGTH(substr);
11153 if (len1 < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011154 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011155 buf1 = PyUnicode_DATA(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011156 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011157 if (len2 == 1) {
11158 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11159 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011160 return result;
11161 }
11162 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011163 buf2 = _PyUnicode_AsKind(substr, kind1);
11164 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011165 return -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011166 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011167
Victor Stinner77282cb2013-04-14 19:22:47 +020011168 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011169 case PyUnicode_1BYTE_KIND:
11170 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11171 break;
11172 case PyUnicode_2BYTE_KIND:
11173 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11174 break;
11175 case PyUnicode_4BYTE_KIND:
11176 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11177 break;
11178 default:
11179 result = -1;
11180 assert(0);
11181 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011182
Victor Stinner77282cb2013-04-14 19:22:47 +020011183 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011184 PyMem_Free(buf2);
11185
Guido van Rossum403d68b2000-03-13 15:55:09 +000011186 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011187}
11188
Guido van Rossumd57fd912000-03-10 22:53:23 +000011189/* Concat to string or Unicode object giving a new Unicode object. */
11190
Alexander Belopolsky40018472011-02-26 01:02:56 +000011191PyObject *
11192PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011193{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011194 PyObject *result;
Victor Stinner127226b2011-10-13 01:12:34 +020011195 Py_UCS4 maxchar, maxchar2;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011196 Py_ssize_t left_len, right_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011197
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011198 if (ensure_unicode(left) < 0 || ensure_unicode(right) < 0)
11199 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011200
11201 /* Shortcuts */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011202 if (left == unicode_empty)
11203 return PyUnicode_FromObject(right);
11204 if (right == unicode_empty)
11205 return PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011206
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011207 left_len = PyUnicode_GET_LENGTH(left);
11208 right_len = PyUnicode_GET_LENGTH(right);
11209 if (left_len > PY_SSIZE_T_MAX - right_len) {
Victor Stinner488fa492011-12-12 00:01:39 +010011210 PyErr_SetString(PyExc_OverflowError,
11211 "strings are too large to concat");
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011212 return NULL;
Victor Stinner488fa492011-12-12 00:01:39 +010011213 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011214 new_len = left_len + right_len;
Victor Stinner488fa492011-12-12 00:01:39 +010011215
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011216 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11217 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011218 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011219
Guido van Rossumd57fd912000-03-10 22:53:23 +000011220 /* Concat the two Unicode strings */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011221 result = PyUnicode_New(new_len, maxchar);
11222 if (result == NULL)
11223 return NULL;
11224 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11225 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11226 assert(_PyUnicode_CheckConsistency(result, 1));
11227 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011228}
11229
Walter Dörwald1ab83302007-05-18 17:15:44 +000011230void
Victor Stinner23e56682011-10-03 03:54:37 +020011231PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011232{
Victor Stinner23e56682011-10-03 03:54:37 +020011233 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011234 Py_UCS4 maxchar, maxchar2;
11235 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011236
11237 if (p_left == NULL) {
11238 if (!PyErr_Occurred())
11239 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011240 return;
11241 }
Victor Stinner23e56682011-10-03 03:54:37 +020011242 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011243 if (right == NULL || left == NULL
11244 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011245 if (!PyErr_Occurred())
11246 PyErr_BadInternalCall();
11247 goto error;
11248 }
11249
Benjamin Petersonbac79492012-01-14 13:34:47 -050011250 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011251 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011252 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011253 goto error;
11254
Victor Stinner488fa492011-12-12 00:01:39 +010011255 /* Shortcuts */
11256 if (left == unicode_empty) {
11257 Py_DECREF(left);
11258 Py_INCREF(right);
11259 *p_left = right;
11260 return;
11261 }
11262 if (right == unicode_empty)
11263 return;
11264
11265 left_len = PyUnicode_GET_LENGTH(left);
11266 right_len = PyUnicode_GET_LENGTH(right);
11267 if (left_len > PY_SSIZE_T_MAX - right_len) {
11268 PyErr_SetString(PyExc_OverflowError,
11269 "strings are too large to concat");
11270 goto error;
11271 }
11272 new_len = left_len + right_len;
11273
11274 if (unicode_modifiable(left)
11275 && PyUnicode_CheckExact(right)
11276 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011277 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11278 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011279 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011280 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011281 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11282 {
11283 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011284 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011285 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011286
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011287 /* copy 'right' into the newly allocated area of 'left' */
11288 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011289 }
Victor Stinner488fa492011-12-12 00:01:39 +010011290 else {
11291 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11292 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011293 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011294
Victor Stinner488fa492011-12-12 00:01:39 +010011295 /* Concat the two Unicode strings */
11296 res = PyUnicode_New(new_len, maxchar);
11297 if (res == NULL)
11298 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011299 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11300 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011301 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011302 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011303 }
11304 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011305 return;
11306
11307error:
Victor Stinner488fa492011-12-12 00:01:39 +010011308 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011309}
11310
11311void
11312PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11313{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011314 PyUnicode_Append(pleft, right);
11315 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011316}
11317
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011318/*
11319Wraps stringlib_parse_args_finds() and additionally ensures that the
11320first argument is a unicode object.
11321*/
11322
11323Py_LOCAL_INLINE(int)
11324parse_args_finds_unicode(const char * function_name, PyObject *args,
11325 PyObject **substring,
11326 Py_ssize_t *start, Py_ssize_t *end)
11327{
11328 if(stringlib_parse_args_finds(function_name, args, substring,
11329 start, end)) {
11330 if (ensure_unicode(*substring) < 0)
11331 return 0;
11332 return 1;
11333 }
11334 return 0;
11335}
11336
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011337PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011338 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011339\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011340Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011341string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011342interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011343
11344static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011345unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011346{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011347 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011348 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011349 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011350 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011351 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011352 void *buf1, *buf2;
11353 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011354
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011355 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011356 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011357
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011358 kind1 = PyUnicode_KIND(self);
11359 kind2 = PyUnicode_KIND(substring);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011360 if (kind1 < kind2)
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011361 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011362
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011363 len1 = PyUnicode_GET_LENGTH(self);
11364 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011365 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011366 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011367 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011368
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011369 buf1 = PyUnicode_DATA(self);
11370 buf2 = PyUnicode_DATA(substring);
11371 if (kind2 != kind1) {
11372 buf2 = _PyUnicode_AsKind(substring, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011373 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011374 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011375 }
11376 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011377 case PyUnicode_1BYTE_KIND:
11378 iresult = ucs1lib_count(
11379 ((Py_UCS1*)buf1) + start, end - start,
11380 buf2, len2, PY_SSIZE_T_MAX
11381 );
11382 break;
11383 case PyUnicode_2BYTE_KIND:
11384 iresult = ucs2lib_count(
11385 ((Py_UCS2*)buf1) + start, end - start,
11386 buf2, len2, PY_SSIZE_T_MAX
11387 );
11388 break;
11389 case PyUnicode_4BYTE_KIND:
11390 iresult = ucs4lib_count(
11391 ((Py_UCS4*)buf1) + start, end - start,
11392 buf2, len2, PY_SSIZE_T_MAX
11393 );
11394 break;
11395 default:
11396 assert(0); iresult = 0;
11397 }
11398
11399 result = PyLong_FromSsize_t(iresult);
11400
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011401 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011402 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011403
Guido van Rossumd57fd912000-03-10 22:53:23 +000011404 return result;
11405}
11406
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011407PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000011408 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011409\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000011410Encode S using the codec registered for encoding. Default encoding\n\
11411is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000011412handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000011413a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
11414'xmlcharrefreplace' as well as any other name registered with\n\
11415codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011416
11417static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011418unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011419{
Benjamin Peterson308d6372009-09-18 21:42:35 +000011420 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011421 char *encoding = NULL;
11422 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000011423
Benjamin Peterson308d6372009-09-18 21:42:35 +000011424 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
11425 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011426 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011427 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011428}
11429
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011430PyDoc_STRVAR(expandtabs__doc__,
Ezio Melotti745d54d2013-11-16 19:10:57 +020011431 "S.expandtabs(tabsize=8) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011432\n\
11433Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011434If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011435
11436static PyObject*
Ezio Melotti745d54d2013-11-16 19:10:57 +020011437unicode_expandtabs(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011438{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011439 Py_ssize_t i, j, line_pos, src_len, incr;
11440 Py_UCS4 ch;
11441 PyObject *u;
11442 void *src_data, *dest_data;
Ezio Melotti745d54d2013-11-16 19:10:57 +020011443 static char *kwlist[] = {"tabsize", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011444 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011445 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011446 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011447
Ezio Melotti745d54d2013-11-16 19:10:57 +020011448 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:expandtabs",
11449 kwlist, &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000011450 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011451
Antoine Pitrou22425222011-10-04 19:10:51 +020011452 if (PyUnicode_READY(self) == -1)
11453 return NULL;
11454
Thomas Wouters7e474022000-07-16 12:04:32 +000011455 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011456 src_len = PyUnicode_GET_LENGTH(self);
11457 i = j = line_pos = 0;
11458 kind = PyUnicode_KIND(self);
11459 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011460 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011461 for (; i < src_len; i++) {
11462 ch = PyUnicode_READ(kind, src_data, i);
11463 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011464 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011465 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011466 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011467 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011468 goto overflow;
11469 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011470 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011471 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011472 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011473 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011474 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011475 goto overflow;
11476 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011477 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011478 if (ch == '\n' || ch == '\r')
11479 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011480 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011481 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011482 if (!found)
11483 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011484
Guido van Rossumd57fd912000-03-10 22:53:23 +000011485 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011486 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011487 if (!u)
11488 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011489 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011490
Antoine Pitroue71d5742011-10-04 15:55:09 +020011491 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011492
Antoine Pitroue71d5742011-10-04 15:55:09 +020011493 for (; i < src_len; i++) {
11494 ch = PyUnicode_READ(kind, src_data, i);
11495 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011496 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011497 incr = tabsize - (line_pos % tabsize);
11498 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010011499 FILL(kind, dest_data, ' ', j, incr);
11500 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011501 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011502 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011503 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011504 line_pos++;
11505 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011506 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011507 if (ch == '\n' || ch == '\r')
11508 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011509 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011510 }
11511 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011512 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011513
Antoine Pitroue71d5742011-10-04 15:55:09 +020011514 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011515 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11516 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011517}
11518
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011519PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011520 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011521\n\
11522Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011523such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011524arguments start and end are interpreted as in slice notation.\n\
11525\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011526Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011527
11528static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011529unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011530{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011531 /* initialize variables to prevent gcc warning */
11532 PyObject *substring = NULL;
11533 Py_ssize_t start = 0;
11534 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011535 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011536
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011537 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011538 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011539
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011540 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011541 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011542
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011543 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011544
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011545 if (result == -2)
11546 return NULL;
11547
Christian Heimes217cfd12007-12-02 14:31:20 +000011548 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011549}
11550
11551static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011552unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011553{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011554 void *data;
11555 enum PyUnicode_Kind kind;
11556 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011557
11558 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
11559 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011560 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011561 }
11562 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11563 PyErr_SetString(PyExc_IndexError, "string index out of range");
11564 return NULL;
11565 }
11566 kind = PyUnicode_KIND(self);
11567 data = PyUnicode_DATA(self);
11568 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011569 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011570}
11571
Guido van Rossumc2504932007-09-18 19:42:40 +000011572/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011573 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011574static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011575unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011576{
Guido van Rossumc2504932007-09-18 19:42:40 +000011577 Py_ssize_t len;
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011578 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011579
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011580#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011581 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011582#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011583 if (_PyUnicode_HASH(self) != -1)
11584 return _PyUnicode_HASH(self);
11585 if (PyUnicode_READY(self) == -1)
11586 return -1;
11587 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011588 /*
11589 We make the hash of the empty string be 0, rather than using
11590 (prefix ^ suffix), since this slightly obfuscates the hash secret
11591 */
11592 if (len == 0) {
11593 _PyUnicode_HASH(self) = 0;
11594 return 0;
11595 }
Christian Heimes985ecdc2013-11-20 11:46:18 +010011596 x = _Py_HashBytes(PyUnicode_DATA(self),
11597 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011598 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011599 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011600}
11601
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011602PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011603 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011604\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011605Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011606
11607static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011608unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011609{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011610 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011611 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011612 PyObject *substring = NULL;
11613 Py_ssize_t start = 0;
11614 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011615
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011616 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011617 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011618
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011619 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011620 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011621
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011622 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011623
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011624 if (result == -2)
11625 return NULL;
11626
Guido van Rossumd57fd912000-03-10 22:53:23 +000011627 if (result < 0) {
11628 PyErr_SetString(PyExc_ValueError, "substring not found");
11629 return NULL;
11630 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011631
Christian Heimes217cfd12007-12-02 14:31:20 +000011632 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011633}
11634
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011635PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011636 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011637\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011638Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011639at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011640
11641static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011642unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011643{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011644 Py_ssize_t i, length;
11645 int kind;
11646 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011647 int cased;
11648
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011649 if (PyUnicode_READY(self) == -1)
11650 return NULL;
11651 length = PyUnicode_GET_LENGTH(self);
11652 kind = PyUnicode_KIND(self);
11653 data = PyUnicode_DATA(self);
11654
Guido van Rossumd57fd912000-03-10 22:53:23 +000011655 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011656 if (length == 1)
11657 return PyBool_FromLong(
11658 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011659
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011660 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011661 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011662 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011663
Guido van Rossumd57fd912000-03-10 22:53:23 +000011664 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011665 for (i = 0; i < length; i++) {
11666 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011667
Benjamin Peterson29060642009-01-31 22:14:21 +000011668 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11669 return PyBool_FromLong(0);
11670 else if (!cased && Py_UNICODE_ISLOWER(ch))
11671 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011672 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011673 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011674}
11675
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011676PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011677 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011678\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011679Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011680at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011681
11682static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011683unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011684{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011685 Py_ssize_t i, length;
11686 int kind;
11687 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011688 int cased;
11689
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011690 if (PyUnicode_READY(self) == -1)
11691 return NULL;
11692 length = PyUnicode_GET_LENGTH(self);
11693 kind = PyUnicode_KIND(self);
11694 data = PyUnicode_DATA(self);
11695
Guido van Rossumd57fd912000-03-10 22:53:23 +000011696 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011697 if (length == 1)
11698 return PyBool_FromLong(
11699 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011700
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011701 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011702 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011703 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011704
Guido van Rossumd57fd912000-03-10 22:53:23 +000011705 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011706 for (i = 0; i < length; i++) {
11707 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011708
Benjamin Peterson29060642009-01-31 22:14:21 +000011709 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11710 return PyBool_FromLong(0);
11711 else if (!cased && Py_UNICODE_ISUPPER(ch))
11712 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011713 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011714 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011715}
11716
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011717PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011718 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011719\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011720Return True if S is a titlecased string and there is at least one\n\
11721character in S, i.e. upper- and titlecase characters may only\n\
11722follow uncased characters and lowercase characters only cased ones.\n\
11723Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011724
11725static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011726unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011727{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011728 Py_ssize_t i, length;
11729 int kind;
11730 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011731 int cased, previous_is_cased;
11732
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011733 if (PyUnicode_READY(self) == -1)
11734 return NULL;
11735 length = PyUnicode_GET_LENGTH(self);
11736 kind = PyUnicode_KIND(self);
11737 data = PyUnicode_DATA(self);
11738
Guido van Rossumd57fd912000-03-10 22:53:23 +000011739 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011740 if (length == 1) {
11741 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11742 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11743 (Py_UNICODE_ISUPPER(ch) != 0));
11744 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011745
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011746 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011747 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011748 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011749
Guido van Rossumd57fd912000-03-10 22:53:23 +000011750 cased = 0;
11751 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011752 for (i = 0; i < length; i++) {
11753 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011754
Benjamin Peterson29060642009-01-31 22:14:21 +000011755 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11756 if (previous_is_cased)
11757 return PyBool_FromLong(0);
11758 previous_is_cased = 1;
11759 cased = 1;
11760 }
11761 else if (Py_UNICODE_ISLOWER(ch)) {
11762 if (!previous_is_cased)
11763 return PyBool_FromLong(0);
11764 previous_is_cased = 1;
11765 cased = 1;
11766 }
11767 else
11768 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011769 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011770 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011771}
11772
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011773PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011774 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011775\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011776Return True if all characters in S are whitespace\n\
11777and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011778
11779static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011780unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011781{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011782 Py_ssize_t i, length;
11783 int kind;
11784 void *data;
11785
11786 if (PyUnicode_READY(self) == -1)
11787 return NULL;
11788 length = PyUnicode_GET_LENGTH(self);
11789 kind = PyUnicode_KIND(self);
11790 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011791
Guido van Rossumd57fd912000-03-10 22:53:23 +000011792 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011793 if (length == 1)
11794 return PyBool_FromLong(
11795 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011796
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011797 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011798 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011799 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011800
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011801 for (i = 0; i < length; i++) {
11802 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011803 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011804 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011805 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011806 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011807}
11808
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011809PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011810 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011811\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011812Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011813and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011814
11815static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011816unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011817{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011818 Py_ssize_t i, length;
11819 int kind;
11820 void *data;
11821
11822 if (PyUnicode_READY(self) == -1)
11823 return NULL;
11824 length = PyUnicode_GET_LENGTH(self);
11825 kind = PyUnicode_KIND(self);
11826 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011827
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011828 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011829 if (length == 1)
11830 return PyBool_FromLong(
11831 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011832
11833 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011834 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011835 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011836
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011837 for (i = 0; i < length; i++) {
11838 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011839 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011840 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011841 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011842}
11843
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011844PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011845 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011846\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011847Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011848and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011849
11850static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011851unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011852{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011853 int kind;
11854 void *data;
11855 Py_ssize_t len, i;
11856
11857 if (PyUnicode_READY(self) == -1)
11858 return NULL;
11859
11860 kind = PyUnicode_KIND(self);
11861 data = PyUnicode_DATA(self);
11862 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011863
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011864 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011865 if (len == 1) {
11866 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11867 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11868 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011869
11870 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011871 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011872 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011873
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011874 for (i = 0; i < len; i++) {
11875 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011876 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011877 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011878 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011879 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011880}
11881
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011882PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011883 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011884\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011885Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011886False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011887
11888static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011889unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011890{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011891 Py_ssize_t i, length;
11892 int kind;
11893 void *data;
11894
11895 if (PyUnicode_READY(self) == -1)
11896 return NULL;
11897 length = PyUnicode_GET_LENGTH(self);
11898 kind = PyUnicode_KIND(self);
11899 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011900
Guido van Rossumd57fd912000-03-10 22:53:23 +000011901 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011902 if (length == 1)
11903 return PyBool_FromLong(
11904 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011905
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011906 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011907 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011908 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011909
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011910 for (i = 0; i < length; i++) {
11911 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011912 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011913 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011914 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011915}
11916
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011917PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011918 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011919\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011920Return True if all characters in S are digits\n\
11921and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011922
11923static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011924unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011925{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011926 Py_ssize_t i, length;
11927 int kind;
11928 void *data;
11929
11930 if (PyUnicode_READY(self) == -1)
11931 return NULL;
11932 length = PyUnicode_GET_LENGTH(self);
11933 kind = PyUnicode_KIND(self);
11934 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011935
Guido van Rossumd57fd912000-03-10 22:53:23 +000011936 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011937 if (length == 1) {
11938 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11939 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11940 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011941
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011942 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011943 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011944 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011945
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011946 for (i = 0; i < length; i++) {
11947 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011948 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011949 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011950 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011951}
11952
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011953PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011954 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011955\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011956Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011957False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011958
11959static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011960unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011961{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011962 Py_ssize_t i, length;
11963 int kind;
11964 void *data;
11965
11966 if (PyUnicode_READY(self) == -1)
11967 return NULL;
11968 length = PyUnicode_GET_LENGTH(self);
11969 kind = PyUnicode_KIND(self);
11970 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011971
Guido van Rossumd57fd912000-03-10 22:53:23 +000011972 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011973 if (length == 1)
11974 return PyBool_FromLong(
11975 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011976
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011977 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011978 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011979 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011980
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011981 for (i = 0; i < length; i++) {
11982 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011983 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011984 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011985 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011986}
11987
Martin v. Löwis47383402007-08-15 07:32:56 +000011988int
11989PyUnicode_IsIdentifier(PyObject *self)
11990{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011991 int kind;
11992 void *data;
11993 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011994 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011995
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011996 if (PyUnicode_READY(self) == -1) {
11997 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011998 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011999 }
12000
12001 /* Special case for empty strings */
12002 if (PyUnicode_GET_LENGTH(self) == 0)
12003 return 0;
12004 kind = PyUnicode_KIND(self);
12005 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000012006
12007 /* PEP 3131 says that the first character must be in
12008 XID_Start and subsequent characters in XID_Continue,
12009 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000012010 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000012011 letters, digits, underscore). However, given the current
12012 definition of XID_Start and XID_Continue, it is sufficient
12013 to check just for these, except that _ must be allowed
12014 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012015 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050012016 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000012017 return 0;
12018
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040012019 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012020 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000012021 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000012022 return 1;
12023}
12024
12025PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012026 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000012027\n\
12028Return True if S is a valid identifier according\n\
Raymond Hettinger378170d2013-03-23 08:21:12 -070012029to the language definition.\n\
12030\n\
12031Use keyword.iskeyword() to test for reserved identifiers\n\
12032such as \"def\" and \"class\".\n");
Martin v. Löwis47383402007-08-15 07:32:56 +000012033
12034static PyObject*
12035unicode_isidentifier(PyObject *self)
12036{
12037 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12038}
12039
Georg Brandl559e5d72008-06-11 18:37:52 +000012040PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012041 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000012042\n\
12043Return True if all characters in S are considered\n\
12044printable in repr() or S is empty, False otherwise.");
12045
12046static PyObject*
12047unicode_isprintable(PyObject *self)
12048{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012049 Py_ssize_t i, length;
12050 int kind;
12051 void *data;
12052
12053 if (PyUnicode_READY(self) == -1)
12054 return NULL;
12055 length = PyUnicode_GET_LENGTH(self);
12056 kind = PyUnicode_KIND(self);
12057 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000012058
12059 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012060 if (length == 1)
12061 return PyBool_FromLong(
12062 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000012063
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012064 for (i = 0; i < length; i++) {
12065 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012066 Py_RETURN_FALSE;
12067 }
12068 }
12069 Py_RETURN_TRUE;
12070}
12071
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012072PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000012073 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012074\n\
12075Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000012076iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012077
12078static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012079unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012080{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012081 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012082}
12083
Martin v. Löwis18e16552006-02-15 17:27:45 +000012084static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012085unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012086{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012087 if (PyUnicode_READY(self) == -1)
12088 return -1;
12089 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012090}
12091
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012092PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012093 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012094\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012095Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012096done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012097
12098static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012099unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012100{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012101 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012102 Py_UCS4 fillchar = ' ';
12103
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012104 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012105 return NULL;
12106
Benjamin Petersonbac79492012-01-14 13:34:47 -050012107 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012108 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012109
Victor Stinnerc4b49542011-12-11 22:44:26 +010012110 if (PyUnicode_GET_LENGTH(self) >= width)
12111 return unicode_result_unchanged(self);
12112
12113 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012114}
12115
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012116PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012117 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012118\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012119Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012120
12121static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012122unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012123{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012124 if (PyUnicode_READY(self) == -1)
12125 return NULL;
12126 if (PyUnicode_IS_ASCII(self))
12127 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012128 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012129}
12130
/* Strip modes understood by the strip helpers below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format strings, indexed by the strip mode above. */
static const char * const stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for strip mode i: the format string with its first three
   characters ("|O:") skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
12139
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012140/* externally visible for str.strip(unicode) */
12141PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012142_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012143{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012144 void *data;
12145 int kind;
12146 Py_ssize_t i, j, len;
12147 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012148 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012149
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012150 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12151 return NULL;
12152
12153 kind = PyUnicode_KIND(self);
12154 data = PyUnicode_DATA(self);
12155 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020012156 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012157 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12158 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020012159 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012160
Benjamin Peterson14339b62009-01-31 16:36:08 +000012161 i = 0;
12162 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012163 while (i < len) {
12164 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12165 if (!BLOOM(sepmask, ch))
12166 break;
12167 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12168 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012169 i++;
12170 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012171 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012172
Benjamin Peterson14339b62009-01-31 16:36:08 +000012173 j = len;
12174 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012175 j--;
12176 while (j >= i) {
12177 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12178 if (!BLOOM(sepmask, ch))
12179 break;
12180 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12181 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012182 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012183 }
12184
Benjamin Peterson29060642009-01-31 22:14:21 +000012185 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012186 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012187
Victor Stinner7931d9a2011-11-04 00:22:48 +010012188 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012189}
12190
12191PyObject*
12192PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12193{
12194 unsigned char *data;
12195 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012196 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012197
Victor Stinnerde636f32011-10-01 03:55:54 +020012198 if (PyUnicode_READY(self) == -1)
12199 return NULL;
12200
Victor Stinner684d5fd2012-05-03 02:32:34 +020012201 length = PyUnicode_GET_LENGTH(self);
12202 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012203
Victor Stinner684d5fd2012-05-03 02:32:34 +020012204 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012205 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012206
Victor Stinnerde636f32011-10-01 03:55:54 +020012207 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012208 PyErr_SetString(PyExc_IndexError, "string index out of range");
12209 return NULL;
12210 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012211 if (start >= length || end < start)
12212 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012213
Victor Stinner684d5fd2012-05-03 02:32:34 +020012214 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012215 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012216 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012217 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012218 }
12219 else {
12220 kind = PyUnicode_KIND(self);
12221 data = PyUnicode_1BYTE_DATA(self);
12222 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012223 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012224 length);
12225 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012226}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012227
12228static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012229do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012230{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012231 Py_ssize_t len, i, j;
12232
12233 if (PyUnicode_READY(self) == -1)
12234 return NULL;
12235
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012236 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012237
Victor Stinnercc7af722013-04-09 22:39:24 +020012238 if (PyUnicode_IS_ASCII(self)) {
12239 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12240
12241 i = 0;
12242 if (striptype != RIGHTSTRIP) {
12243 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012244 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012245 if (!_Py_ascii_whitespace[ch])
12246 break;
12247 i++;
12248 }
12249 }
12250
12251 j = len;
12252 if (striptype != LEFTSTRIP) {
12253 j--;
12254 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012255 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012256 if (!_Py_ascii_whitespace[ch])
12257 break;
12258 j--;
12259 }
12260 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012261 }
12262 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012263 else {
12264 int kind = PyUnicode_KIND(self);
12265 void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012266
Victor Stinnercc7af722013-04-09 22:39:24 +020012267 i = 0;
12268 if (striptype != RIGHTSTRIP) {
12269 while (i < len) {
12270 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12271 if (!Py_UNICODE_ISSPACE(ch))
12272 break;
12273 i++;
12274 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012275 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012276
12277 j = len;
12278 if (striptype != LEFTSTRIP) {
12279 j--;
12280 while (j >= i) {
12281 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12282 if (!Py_UNICODE_ISSPACE(ch))
12283 break;
12284 j--;
12285 }
12286 j++;
12287 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012288 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012289
Victor Stinner7931d9a2011-11-04 00:22:48 +010012290 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012291}
12292
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012293
12294static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012295do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012296{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012297 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012298
Serhiy Storchakac6792272013-10-19 21:03:34 +030012299 if (!PyArg_ParseTuple(args, stripformat[striptype], &sep))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012300 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012301
Benjamin Peterson14339b62009-01-31 16:36:08 +000012302 if (sep != NULL && sep != Py_None) {
12303 if (PyUnicode_Check(sep))
12304 return _PyUnicode_XStrip(self, striptype, sep);
12305 else {
12306 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012307 "%s arg must be None or str",
12308 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012309 return NULL;
12310 }
12311 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012312
Benjamin Peterson14339b62009-01-31 16:36:08 +000012313 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012314}
12315
12316
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012317PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012318 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012319\n\
12320Return a copy of the string S with leading and trailing\n\
12321whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012322If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012323
12324static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012325unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012326{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012327 if (PyTuple_GET_SIZE(args) == 0)
12328 return do_strip(self, BOTHSTRIP); /* Common case */
12329 else
12330 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012331}
12332
12333
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012334PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012335 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012336\n\
12337Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012338If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012339
12340static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012341unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012342{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012343 if (PyTuple_GET_SIZE(args) == 0)
12344 return do_strip(self, LEFTSTRIP); /* Common case */
12345 else
12346 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012347}
12348
12349
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012350PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012351 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012352\n\
12353Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012354If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012355
12356static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012357unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012358{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012359 if (PyTuple_GET_SIZE(args) == 0)
12360 return do_strip(self, RIGHTSTRIP); /* Common case */
12361 else
12362 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012363}
12364
12365
Guido van Rossumd57fd912000-03-10 22:53:23 +000012366static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012367unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012368{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012369 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012370 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012371
Serhiy Storchaka05997252013-01-26 12:14:02 +020012372 if (len < 1)
12373 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012374
Victor Stinnerc4b49542011-12-11 22:44:26 +010012375 /* no repeat, return original string */
12376 if (len == 1)
12377 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012378
Benjamin Petersonbac79492012-01-14 13:34:47 -050012379 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012380 return NULL;
12381
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012382 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012383 PyErr_SetString(PyExc_OverflowError,
12384 "repeated string is too long");
12385 return NULL;
12386 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012387 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012388
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012389 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012390 if (!u)
12391 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012392 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012393
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012394 if (PyUnicode_GET_LENGTH(str) == 1) {
12395 const int kind = PyUnicode_KIND(str);
12396 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012397 if (kind == PyUnicode_1BYTE_KIND) {
12398 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012399 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012400 }
12401 else if (kind == PyUnicode_2BYTE_KIND) {
12402 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012403 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012404 ucs2[n] = fill_char;
12405 } else {
12406 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12407 assert(kind == PyUnicode_4BYTE_KIND);
12408 for (n = 0; n < len; ++n)
12409 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012410 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012411 }
12412 else {
12413 /* number of characters copied this far */
12414 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012415 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012416 char *to = (char *) PyUnicode_DATA(u);
12417 Py_MEMCPY(to, PyUnicode_DATA(str),
12418 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012419 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012420 n = (done <= nchars-done) ? done : nchars-done;
12421 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012422 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012423 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012424 }
12425
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012426 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012427 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012428}
12429
Alexander Belopolsky40018472011-02-26 01:02:56 +000012430PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012431PyUnicode_Replace(PyObject *str,
12432 PyObject *substr,
12433 PyObject *replstr,
Alexander Belopolsky40018472011-02-26 01:02:56 +000012434 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012435{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012436 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12437 ensure_unicode(replstr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012438 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012439 return replace(str, substr, replstr, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012440}
12441
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012442PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000012443 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012444\n\
12445Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000012446old replaced by new. If the optional argument count is\n\
12447given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012448
12449static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012450unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012451{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012452 PyObject *str1;
12453 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012454 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012455
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012456 if (!PyArg_ParseTuple(args, "UU|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012457 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060012458 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012459 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012460 return replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012461}
12462
Alexander Belopolsky40018472011-02-26 01:02:56 +000012463static PyObject *
12464unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012465{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012466 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012467 Py_ssize_t isize;
12468 Py_ssize_t osize, squote, dquote, i, o;
12469 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012470 int ikind, okind, unchanged;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012471 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012472
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012473 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012474 return NULL;
12475
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012476 isize = PyUnicode_GET_LENGTH(unicode);
12477 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012478
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012479 /* Compute length of output, quote characters, and
12480 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012481 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012482 max = 127;
12483 squote = dquote = 0;
12484 ikind = PyUnicode_KIND(unicode);
12485 for (i = 0; i < isize; i++) {
12486 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040012487 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012488 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040012489 case '\'': squote++; break;
12490 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012491 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040012492 incr = 2;
12493 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012494 default:
12495 /* Fast-path ASCII */
12496 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012497 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012498 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012499 ;
12500 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012501 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012502 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012503 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012504 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012505 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012506 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040012507 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012508 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040012509 if (osize > PY_SSIZE_T_MAX - incr) {
12510 PyErr_SetString(PyExc_OverflowError,
12511 "string is too long to generate repr");
12512 return NULL;
12513 }
12514 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012515 }
12516
12517 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012518 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012519 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012520 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012521 if (dquote)
12522 /* Both squote and dquote present. Use squote,
12523 and escape them */
12524 osize += squote;
12525 else
12526 quote = '"';
12527 }
Victor Stinner55c08782013-04-14 18:45:39 +020012528 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012529
12530 repr = PyUnicode_New(osize, max);
12531 if (repr == NULL)
12532 return NULL;
12533 okind = PyUnicode_KIND(repr);
12534 odata = PyUnicode_DATA(repr);
12535
12536 PyUnicode_WRITE(okind, odata, 0, quote);
12537 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012538 if (unchanged) {
12539 _PyUnicode_FastCopyCharacters(repr, 1,
12540 unicode, 0,
12541 isize);
12542 }
12543 else {
12544 for (i = 0, o = 1; i < isize; i++) {
12545 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012546
Victor Stinner55c08782013-04-14 18:45:39 +020012547 /* Escape quotes and backslashes */
12548 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012549 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012550 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012551 continue;
12552 }
12553
12554 /* Map special whitespace to '\t', \n', '\r' */
12555 if (ch == '\t') {
12556 PyUnicode_WRITE(okind, odata, o++, '\\');
12557 PyUnicode_WRITE(okind, odata, o++, 't');
12558 }
12559 else if (ch == '\n') {
12560 PyUnicode_WRITE(okind, odata, o++, '\\');
12561 PyUnicode_WRITE(okind, odata, o++, 'n');
12562 }
12563 else if (ch == '\r') {
12564 PyUnicode_WRITE(okind, odata, o++, '\\');
12565 PyUnicode_WRITE(okind, odata, o++, 'r');
12566 }
12567
12568 /* Map non-printable US ASCII to '\xhh' */
12569 else if (ch < ' ' || ch == 0x7F) {
12570 PyUnicode_WRITE(okind, odata, o++, '\\');
12571 PyUnicode_WRITE(okind, odata, o++, 'x');
12572 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12573 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12574 }
12575
12576 /* Copy ASCII characters as-is */
12577 else if (ch < 0x7F) {
12578 PyUnicode_WRITE(okind, odata, o++, ch);
12579 }
12580
12581 /* Non-ASCII characters */
12582 else {
12583 /* Map Unicode whitespace and control characters
12584 (categories Z* and C* except ASCII space)
12585 */
12586 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12587 PyUnicode_WRITE(okind, odata, o++, '\\');
12588 /* Map 8-bit characters to '\xhh' */
12589 if (ch <= 0xff) {
12590 PyUnicode_WRITE(okind, odata, o++, 'x');
12591 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12592 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12593 }
12594 /* Map 16-bit characters to '\uxxxx' */
12595 else if (ch <= 0xffff) {
12596 PyUnicode_WRITE(okind, odata, o++, 'u');
12597 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12598 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12599 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12600 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12601 }
12602 /* Map 21-bit characters to '\U00xxxxxx' */
12603 else {
12604 PyUnicode_WRITE(okind, odata, o++, 'U');
12605 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12606 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12607 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12608 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12609 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12610 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12611 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12612 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12613 }
12614 }
12615 /* Copy characters as-is */
12616 else {
12617 PyUnicode_WRITE(okind, odata, o++, ch);
12618 }
Georg Brandl559e5d72008-06-11 18:37:52 +000012619 }
12620 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012621 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012622 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012623 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012624 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012625}
12626
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012627PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012628 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012629\n\
12630Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012631such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012632arguments start and end are interpreted as in slice notation.\n\
12633\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012634Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012635
12636static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012637unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012638{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012639 /* initialize variables to prevent gcc warning */
12640 PyObject *substring = NULL;
12641 Py_ssize_t start = 0;
12642 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012643 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012644
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012645 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012646 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012647
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012648 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012649 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012650
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012651 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012652
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012653 if (result == -2)
12654 return NULL;
12655
Christian Heimes217cfd12007-12-02 14:31:20 +000012656 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012657}
12658
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012659PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012660 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012661\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012662Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012663
12664static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012665unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012666{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012667 /* initialize variables to prevent gcc warning */
12668 PyObject *substring = NULL;
12669 Py_ssize_t start = 0;
12670 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012671 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012672
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012673 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012674 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012675
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012676 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012677 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012678
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012679 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012680
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012681 if (result == -2)
12682 return NULL;
12683
Guido van Rossumd57fd912000-03-10 22:53:23 +000012684 if (result < 0) {
12685 PyErr_SetString(PyExc_ValueError, "substring not found");
12686 return NULL;
12687 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012688
Christian Heimes217cfd12007-12-02 14:31:20 +000012689 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012690}
12691
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012692PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012693 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012694\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012695Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012696done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012697
12698static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012699unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012700{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012701 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012702 Py_UCS4 fillchar = ' ';
12703
Victor Stinnere9a29352011-10-01 02:14:59 +020012704 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012705 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012706
Benjamin Petersonbac79492012-01-14 13:34:47 -050012707 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012708 return NULL;
12709
Victor Stinnerc4b49542011-12-11 22:44:26 +010012710 if (PyUnicode_GET_LENGTH(self) >= width)
12711 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012712
Victor Stinnerc4b49542011-12-11 22:44:26 +010012713 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012714}
12715
Alexander Belopolsky40018472011-02-26 01:02:56 +000012716PyObject *
12717PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012718{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012719 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012720 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012721
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012722 return split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012723}
12724
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012725PyDoc_STRVAR(split__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012726 "S.split(sep=None, maxsplit=-1) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012727\n\
12728Return a list of the words in S, using sep as the\n\
12729delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012730splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012731whitespace string is a separator and empty strings are\n\
12732removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012733
12734static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012735unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012736{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012737 static char *kwlist[] = {"sep", "maxsplit", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000012738 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012739 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012740
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012741 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12742 kwlist, &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012743 return NULL;
12744
12745 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012746 return split(self, NULL, maxcount);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012747
12748 if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012749 return split(self, substring, maxcount);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012750
12751 PyErr_Format(PyExc_TypeError,
12752 "must be str or None, not %.100s",
12753 Py_TYPE(substring)->tp_name);
12754 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012755}
12756
Thomas Wouters477c8d52006-05-27 19:21:47 +000012757PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012758PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012759{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012760 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012761 int kind1, kind2;
12762 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012763 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012764
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012765 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012766 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012767
Victor Stinner14f8f022011-10-05 20:58:25 +020012768 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012769 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012770 len1 = PyUnicode_GET_LENGTH(str_obj);
12771 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012772 if (kind1 < kind2 || len1 < len2) {
12773 _Py_INCREF_UNICODE_EMPTY();
12774 if (!unicode_empty)
12775 out = NULL;
12776 else {
12777 out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
12778 Py_DECREF(unicode_empty);
12779 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012780 return out;
12781 }
12782 buf1 = PyUnicode_DATA(str_obj);
12783 buf2 = PyUnicode_DATA(sep_obj);
12784 if (kind2 != kind1) {
12785 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12786 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012787 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012788 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012789
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012790 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012791 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012792 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12793 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12794 else
12795 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012796 break;
12797 case PyUnicode_2BYTE_KIND:
12798 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12799 break;
12800 case PyUnicode_4BYTE_KIND:
12801 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12802 break;
12803 default:
12804 assert(0);
12805 out = 0;
12806 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012807
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012808 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012809 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012810
12811 return out;
12812}
12813
12814
12815PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012816PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012817{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012818 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012819 int kind1, kind2;
12820 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012821 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012822
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012823 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012824 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012825
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012826 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012827 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012828 len1 = PyUnicode_GET_LENGTH(str_obj);
12829 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012830 if (kind1 < kind2 || len1 < len2) {
12831 _Py_INCREF_UNICODE_EMPTY();
12832 if (!unicode_empty)
12833 out = NULL;
12834 else {
12835 out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
12836 Py_DECREF(unicode_empty);
12837 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012838 return out;
12839 }
12840 buf1 = PyUnicode_DATA(str_obj);
12841 buf2 = PyUnicode_DATA(sep_obj);
12842 if (kind2 != kind1) {
12843 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12844 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012845 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012846 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012847
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012848 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012849 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012850 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12851 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12852 else
12853 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012854 break;
12855 case PyUnicode_2BYTE_KIND:
12856 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12857 break;
12858 case PyUnicode_4BYTE_KIND:
12859 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12860 break;
12861 default:
12862 assert(0);
12863 out = 0;
12864 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012865
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012866 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012867 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012868
12869 return out;
12870}
12871
12872PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012873 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012874\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012875Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012876the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012877found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012878
12879static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012880unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012881{
Victor Stinner9310abb2011-10-05 00:59:23 +020012882 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012883}
12884
12885PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012886 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012887\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012888Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012889the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012890separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012891
12892static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012893unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012894{
Victor Stinner9310abb2011-10-05 00:59:23 +020012895 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012896}
12897
Alexander Belopolsky40018472011-02-26 01:02:56 +000012898PyObject *
12899PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012900{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012901 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012902 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012903
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012904 return rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012905}
12906
12907PyDoc_STRVAR(rsplit__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012908 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012909\n\
12910Return a list of the words in S, using sep as the\n\
12911delimiter string, starting at the end of the string and\n\
12912working to the front. If maxsplit is given, at most maxsplit\n\
12913splits are done. If sep is not specified, any whitespace string\n\
12914is a separator.");
12915
12916static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012917unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012918{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012919 static char *kwlist[] = {"sep", "maxsplit", 0};
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012920 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012921 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012922
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012923 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
12924 kwlist, &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012925 return NULL;
12926
12927 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012928 return rsplit(self, NULL, maxcount);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012929
12930 if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012931 return rsplit(self, substring, maxcount);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012932
12933 PyErr_Format(PyExc_TypeError,
12934 "must be str or None, not %.100s",
12935 Py_TYPE(substring)->tp_name);
12936 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012937}
12938
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012939PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012940 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012941\n\
12942Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012943Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012944is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012945
12946static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012947unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012948{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012949 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012950 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012951
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012952 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12953 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012954 return NULL;
12955
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012956 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012957}
12958
12959static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012960PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012961{
Victor Stinnerc4b49542011-12-11 22:44:26 +010012962 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012963}
12964
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012965PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012966 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012967\n\
12968Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012969and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012970
12971static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012972unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012973{
Benjamin Petersoneea48462012-01-16 14:28:50 -050012974 if (PyUnicode_READY(self) == -1)
12975 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012976 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012977}
12978
Larry Hastings61272b72014-01-07 12:41:53 -080012979/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000012980
Larry Hastings31826802013-10-19 00:09:25 -070012981@staticmethod
12982str.maketrans as unicode_maketrans
12983
12984 x: object
12985
12986 y: unicode=NULL
12987
12988 z: unicode=NULL
12989
12990 /
12991
12992Return a translation table usable for str.translate().
12993
12994If there is only one argument, it must be a dictionary mapping Unicode
12995ordinals (integers) or characters to Unicode ordinals, strings or None.
12996Character keys will be then converted to ordinals.
12997If there are two arguments, they must be strings of equal length, and
12998in the resulting dictionary, each character in x will be mapped to the
12999character at the same position in y. If there is a third argument, it
13000must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080013001[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070013002
Larry Hastings31826802013-10-19 00:09:25 -070013003static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080013004unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030013005/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070013006{
Georg Brandlceee0772007-11-27 23:48:05 +000013007 PyObject *new = NULL, *key, *value;
13008 Py_ssize_t i = 0;
13009 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013010
Georg Brandlceee0772007-11-27 23:48:05 +000013011 new = PyDict_New();
13012 if (!new)
13013 return NULL;
13014 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013015 int x_kind, y_kind, z_kind;
13016 void *x_data, *y_data, *z_data;
13017
Georg Brandlceee0772007-11-27 23:48:05 +000013018 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000013019 if (!PyUnicode_Check(x)) {
13020 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13021 "be a string if there is a second argument");
13022 goto err;
13023 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013024 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013025 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13026 "arguments must have equal length");
13027 goto err;
13028 }
13029 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013030 x_kind = PyUnicode_KIND(x);
13031 y_kind = PyUnicode_KIND(y);
13032 x_data = PyUnicode_DATA(x);
13033 y_data = PyUnicode_DATA(y);
13034 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13035 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013036 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000013037 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060013038 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013039 if (!value) {
13040 Py_DECREF(key);
13041 goto err;
13042 }
Georg Brandlceee0772007-11-27 23:48:05 +000013043 res = PyDict_SetItem(new, key, value);
13044 Py_DECREF(key);
13045 Py_DECREF(value);
13046 if (res < 0)
13047 goto err;
13048 }
13049 /* create entries for deleting chars in z */
13050 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013051 z_kind = PyUnicode_KIND(z);
13052 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013053 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013054 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013055 if (!key)
13056 goto err;
13057 res = PyDict_SetItem(new, key, Py_None);
13058 Py_DECREF(key);
13059 if (res < 0)
13060 goto err;
13061 }
13062 }
13063 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013064 int kind;
13065 void *data;
13066
Georg Brandlceee0772007-11-27 23:48:05 +000013067 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013068 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013069 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13070 "to maketrans it must be a dict");
13071 goto err;
13072 }
13073 /* copy entries into the new dict, converting string keys to int keys */
13074 while (PyDict_Next(x, &i, &key, &value)) {
13075 if (PyUnicode_Check(key)) {
13076 /* convert string keys to integer keys */
13077 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013078 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013079 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13080 "table must be of length 1");
13081 goto err;
13082 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013083 kind = PyUnicode_KIND(key);
13084 data = PyUnicode_DATA(key);
13085 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013086 if (!newkey)
13087 goto err;
13088 res = PyDict_SetItem(new, newkey, value);
13089 Py_DECREF(newkey);
13090 if (res < 0)
13091 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013092 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013093 /* just keep integer keys */
13094 if (PyDict_SetItem(new, key, value) < 0)
13095 goto err;
13096 } else {
13097 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13098 "be strings or integers");
13099 goto err;
13100 }
13101 }
13102 }
13103 return new;
13104 err:
13105 Py_DECREF(new);
13106 return NULL;
13107}
13108
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013109PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013110 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013111\n\
Zachary Ware79b98df2015-08-05 23:54:15 -050013112Return a copy of the string S in which each character has been mapped\n\
13113through the given translation table. The table must implement\n\
13114lookup/indexing via __getitem__, for instance a dictionary or list,\n\
13115mapping Unicode ordinals to Unicode ordinals, strings, or None. If\n\
13116this operation raises LookupError, the character is left untouched.\n\
13117Characters mapped to None are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013118
13119static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013120unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013121{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013122 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013123}
13124
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013125PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013126 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013127\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013128Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013129
13130static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020013131unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013132{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013133 if (PyUnicode_READY(self) == -1)
13134 return NULL;
13135 if (PyUnicode_IS_ASCII(self))
13136 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013137 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013138}
13139
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013140PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013141 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013142\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000013143Pad a numeric string S with zeros on the left, to fill a field\n\
13144of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013145
13146static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020013147unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013148{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013149 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013150 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013151 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013152 int kind;
13153 void *data;
13154 Py_UCS4 chr;
13155
Martin v. Löwis18e16552006-02-15 17:27:45 +000013156 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000013157 return NULL;
13158
Benjamin Petersonbac79492012-01-14 13:34:47 -050013159 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013160 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013161
Victor Stinnerc4b49542011-12-11 22:44:26 +010013162 if (PyUnicode_GET_LENGTH(self) >= width)
13163 return unicode_result_unchanged(self);
13164
13165 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013166
13167 u = pad(self, fill, 0, '0');
13168
Walter Dörwald068325e2002-04-15 13:36:47 +000013169 if (u == NULL)
13170 return NULL;
13171
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013172 kind = PyUnicode_KIND(u);
13173 data = PyUnicode_DATA(u);
13174 chr = PyUnicode_READ(kind, data, fill);
13175
13176 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013177 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013178 PyUnicode_WRITE(kind, data, 0, chr);
13179 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013180 }
13181
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013182 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013183 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013184}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013185
#if 0
/* Compiled out: thin wrapper around
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);
    return converted;
}
#endif
13193
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013194PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013195 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013196\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013197Return True if S starts with the specified prefix, False otherwise.\n\
13198With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013199With optional end, stop comparing S at that position.\n\
13200prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013201
13202static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013203unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013204 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013205{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013206 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013207 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013208 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013209 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013210 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013211
Jesus Ceaac451502011-04-20 17:09:23 +020013212 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013213 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013214 if (PyTuple_Check(subobj)) {
13215 Py_ssize_t i;
13216 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013217 substring = PyTuple_GET_ITEM(subobj, i);
13218 if (!PyUnicode_Check(substring)) {
13219 PyErr_Format(PyExc_TypeError,
13220 "tuple for startswith must only contain str, "
13221 "not %.100s",
13222 Py_TYPE(substring)->tp_name);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013223 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013224 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013225 result = tailmatch(self, substring, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013226 if (result == -1)
13227 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013228 if (result) {
13229 Py_RETURN_TRUE;
13230 }
13231 }
13232 /* nothing matched */
13233 Py_RETURN_FALSE;
13234 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013235 if (!PyUnicode_Check(subobj)) {
13236 PyErr_Format(PyExc_TypeError,
13237 "startswith first arg must be str or "
13238 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013239 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013240 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013241 result = tailmatch(self, subobj, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013242 if (result == -1)
13243 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013244 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013245}
13246
13247
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013248PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013249 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013250\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013251Return True if S ends with the specified suffix, False otherwise.\n\
13252With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013253With optional end, stop comparing S at that position.\n\
13254suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013255
13256static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013257unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013258 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013259{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013260 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013261 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013262 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013263 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013264 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013265
Jesus Ceaac451502011-04-20 17:09:23 +020013266 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013267 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013268 if (PyTuple_Check(subobj)) {
13269 Py_ssize_t i;
13270 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013271 substring = PyTuple_GET_ITEM(subobj, i);
13272 if (!PyUnicode_Check(substring)) {
13273 PyErr_Format(PyExc_TypeError,
13274 "tuple for endswith must only contain str, "
13275 "not %.100s",
13276 Py_TYPE(substring)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013277 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013278 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013279 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013280 if (result == -1)
13281 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013282 if (result) {
13283 Py_RETURN_TRUE;
13284 }
13285 }
13286 Py_RETURN_FALSE;
13287 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013288 if (!PyUnicode_Check(subobj)) {
13289 PyErr_Format(PyExc_TypeError,
13290 "endswith first arg must be str or "
13291 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013292 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013293 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013294 result = tailmatch(self, subobj, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013295 if (result == -1)
13296 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013297 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013298}
13299
Victor Stinner202fdca2012-05-07 12:47:02 +020013300Py_LOCAL_INLINE(void)
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013301_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013302{
Victor Stinnereb36fda2015-10-03 01:55:51 +020013303 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13304 writer->data = PyUnicode_DATA(writer->buffer);
13305
13306 if (!writer->readonly) {
13307 writer->kind = PyUnicode_KIND(writer->buffer);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013308 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinnereb36fda2015-10-03 01:55:51 +020013309 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013310 else {
Victor Stinnereb36fda2015-10-03 01:55:51 +020013311 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13312 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13313 writer->kind = PyUnicode_WCHAR_KIND;
13314 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13315
Victor Stinner8f674cc2013-04-17 23:02:17 +020013316 /* Copy-on-write mode: set buffer size to 0 so
13317 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13318 * next write. */
13319 writer->size = 0;
13320 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013321}
13322
Victor Stinnerd3f08822012-05-29 12:57:52 +020013323void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013324_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013325{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013326 memset(writer, 0, sizeof(*writer));
Victor Stinnereb36fda2015-10-03 01:55:51 +020013327
13328 /* ASCII is the bare minimum */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013329 writer->min_char = 127;
Victor Stinnereb36fda2015-10-03 01:55:51 +020013330
13331 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13332 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13333 writer->kind = PyUnicode_WCHAR_KIND;
13334 assert(writer->kind <= PyUnicode_1BYTE_KIND);
Victor Stinner202fdca2012-05-07 12:47:02 +020013335}
13336
Victor Stinnerd3f08822012-05-29 12:57:52 +020013337int
13338_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13339 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013340{
13341 Py_ssize_t newlen;
13342 PyObject *newbuffer;
13343
Victor Stinnerca9381e2015-09-22 00:58:32 +020013344 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
Victor Stinner61744742015-09-22 01:01:17 +020013345 assert((maxchar > writer->maxchar && length >= 0)
13346 || length > 0);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013347
Victor Stinner202fdca2012-05-07 12:47:02 +020013348 if (length > PY_SSIZE_T_MAX - writer->pos) {
13349 PyErr_NoMemory();
13350 return -1;
13351 }
13352 newlen = writer->pos + length;
13353
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013354 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013355
Victor Stinnerd3f08822012-05-29 12:57:52 +020013356 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013357 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013358 if (writer->overallocate
13359 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13360 /* overallocate to limit the number of realloc() */
13361 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013362 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013363 if (newlen < writer->min_length)
13364 newlen = writer->min_length;
13365
Victor Stinnerd3f08822012-05-29 12:57:52 +020013366 writer->buffer = PyUnicode_New(newlen, maxchar);
13367 if (writer->buffer == NULL)
13368 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013369 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013370 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013371 if (writer->overallocate
13372 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13373 /* overallocate to limit the number of realloc() */
13374 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013375 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013376 if (newlen < writer->min_length)
13377 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013378
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013379 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013380 /* resize + widen */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +030013381 maxchar = Py_MAX(maxchar, writer->maxchar);
Victor Stinner202fdca2012-05-07 12:47:02 +020013382 newbuffer = PyUnicode_New(newlen, maxchar);
13383 if (newbuffer == NULL)
13384 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013385 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13386 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013387 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013388 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013389 }
13390 else {
13391 newbuffer = resize_compact(writer->buffer, newlen);
13392 if (newbuffer == NULL)
13393 return -1;
13394 }
13395 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013396 }
13397 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013398 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013399 newbuffer = PyUnicode_New(writer->size, maxchar);
13400 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013401 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013402 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13403 writer->buffer, 0, writer->pos);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030013404 Py_SETREF(writer->buffer, newbuffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013405 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013406 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013407 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010013408
13409#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020013410}
13411
Victor Stinnerca9381e2015-09-22 00:58:32 +020013412int
13413_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13414 enum PyUnicode_Kind kind)
13415{
13416 Py_UCS4 maxchar;
13417
13418 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13419 assert(writer->kind < kind);
13420
13421 switch (kind)
13422 {
13423 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13424 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13425 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
13426 default:
13427 assert(0 && "invalid kind");
13428 return -1;
13429 }
13430
13431 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13432}
13433
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013434Py_LOCAL_INLINE(int)
13435_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013436{
13437 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13438 return -1;
13439 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13440 writer->pos++;
13441 return 0;
13442}
13443
13444int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013445_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13446{
13447 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13448}
13449
13450int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013451_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13452{
13453 Py_UCS4 maxchar;
13454 Py_ssize_t len;
13455
13456 if (PyUnicode_READY(str) == -1)
13457 return -1;
13458 len = PyUnicode_GET_LENGTH(str);
13459 if (len == 0)
13460 return 0;
13461 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13462 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013463 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010013464 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020013465 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013466 Py_INCREF(str);
13467 writer->buffer = str;
13468 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013469 writer->pos += len;
13470 return 0;
13471 }
13472 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13473 return -1;
13474 }
13475 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13476 str, 0, len);
13477 writer->pos += len;
13478 return 0;
13479}
13480
Victor Stinnere215d962012-10-06 23:03:36 +020013481int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013482_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13483 Py_ssize_t start, Py_ssize_t end)
13484{
13485 Py_UCS4 maxchar;
13486 Py_ssize_t len;
13487
13488 if (PyUnicode_READY(str) == -1)
13489 return -1;
13490
13491 assert(0 <= start);
13492 assert(end <= PyUnicode_GET_LENGTH(str));
13493 assert(start <= end);
13494
13495 if (end == 0)
13496 return 0;
13497
13498 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13499 return _PyUnicodeWriter_WriteStr(writer, str);
13500
13501 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13502 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13503 else
13504 maxchar = writer->maxchar;
13505 len = end - start;
13506
13507 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13508 return -1;
13509
13510 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13511 str, start, len);
13512 writer->pos += len;
13513 return 0;
13514}
13515
13516int
Victor Stinner4a587072013-11-19 12:54:53 +010013517_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13518 const char *ascii, Py_ssize_t len)
13519{
13520 if (len == -1)
13521 len = strlen(ascii);
13522
13523 assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13524
13525 if (writer->buffer == NULL && !writer->overallocate) {
13526 PyObject *str;
13527
13528 str = _PyUnicode_FromASCII(ascii, len);
13529 if (str == NULL)
13530 return -1;
13531
13532 writer->readonly = 1;
13533 writer->buffer = str;
13534 _PyUnicodeWriter_Update(writer);
13535 writer->pos += len;
13536 return 0;
13537 }
13538
13539 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13540 return -1;
13541
13542 switch (writer->kind)
13543 {
13544 case PyUnicode_1BYTE_KIND:
13545 {
13546 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13547 Py_UCS1 *data = writer->data;
13548
13549 Py_MEMCPY(data + writer->pos, str, len);
13550 break;
13551 }
13552 case PyUnicode_2BYTE_KIND:
13553 {
13554 _PyUnicode_CONVERT_BYTES(
13555 Py_UCS1, Py_UCS2,
13556 ascii, ascii + len,
13557 (Py_UCS2 *)writer->data + writer->pos);
13558 break;
13559 }
13560 case PyUnicode_4BYTE_KIND:
13561 {
13562 _PyUnicode_CONVERT_BYTES(
13563 Py_UCS1, Py_UCS4,
13564 ascii, ascii + len,
13565 (Py_UCS4 *)writer->data + writer->pos);
13566 break;
13567 }
13568 default:
13569 assert(0);
13570 }
13571
13572 writer->pos += len;
13573 return 0;
13574}
13575
13576int
13577_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13578 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020013579{
13580 Py_UCS4 maxchar;
13581
13582 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13583 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13584 return -1;
13585 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13586 writer->pos += len;
13587 return 0;
13588}
13589
Victor Stinnerd3f08822012-05-29 12:57:52 +020013590PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013591_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013592{
Victor Stinner15a0bd32013-07-08 22:29:55 +020013593 PyObject *str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013594 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013595 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013596 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013597 }
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013598 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013599 str = writer->buffer;
13600 writer->buffer = NULL;
13601 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13602 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013603 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013604 if (writer->pos == 0) {
13605 Py_CLEAR(writer->buffer);
13606
13607 /* Get the empty Unicode string singleton ('') */
13608 _Py_INCREF_UNICODE_EMPTY();
13609 str = unicode_empty;
Victor Stinner202fdca2012-05-07 12:47:02 +020013610 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013611 else {
13612 str = writer->buffer;
13613 writer->buffer = NULL;
13614
13615 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
13616 PyObject *str2;
13617 str2 = resize_compact(str, writer->pos);
13618 if (str2 == NULL)
13619 return NULL;
13620 str = str2;
13621 }
13622 }
13623
Victor Stinner15a0bd32013-07-08 22:29:55 +020013624 assert(_PyUnicode_CheckConsistency(str, 1));
13625 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020013626}
13627
/* Release the writer's buffer, if any, and reset writer->buffer to NULL.
   Used on error paths (for example by unicode__format__()) when the
   result will not be retrieved with _PyUnicodeWriter_Finish(). */
void
_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
{
    Py_CLEAR(writer->buffer);
}
13633
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013634#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013635
/* Docstring for the "format" entry of unicode_methods. */
PyDoc_STRVAR(format__doc__,
             "S.format(*args, **kwargs) -> str\n\
\n\
Return a formatted version of S, using substitutions from args and kwargs.\n\
The substitutions are identified by braces ('{' and '}').");

/* Docstring for the "format_map" entry of unicode_methods. */
PyDoc_STRVAR(format_map__doc__,
             "S.format_map(mapping) -> str\n\
\n\
Return a formatted version of S, using substitutions from mapping.\n\
The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013647
Eric Smith4a7d76d2008-05-30 18:10:19 +000013648static PyObject *
13649unicode__format__(PyObject* self, PyObject* args)
13650{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013651 PyObject *format_spec;
13652 _PyUnicodeWriter writer;
13653 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013654
13655 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
13656 return NULL;
13657
Victor Stinnerd3f08822012-05-29 12:57:52 +020013658 if (PyUnicode_READY(self) == -1)
13659 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020013660 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013661 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13662 self, format_spec, 0,
13663 PyUnicode_GET_LENGTH(format_spec));
13664 if (ret == -1) {
13665 _PyUnicodeWriter_Dealloc(&writer);
13666 return NULL;
13667 }
13668 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013669}
13670
/* Docstring for the "__format__" entry of unicode_methods. */
PyDoc_STRVAR(p_format__doc__,
             "S.__format__(format_spec) -> str\n\
\n\
Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000013675
13676static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013677unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013678{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013679 Py_ssize_t size;
13680
13681 /* If it's a compact object, account for base structure +
13682 character data. */
13683 if (PyUnicode_IS_COMPACT_ASCII(v))
13684 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
13685 else if (PyUnicode_IS_COMPACT(v))
13686 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013687 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013688 else {
13689 /* If it is a two-block object, account for base object, and
13690 for character block if present. */
13691 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020013692 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013693 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013694 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013695 }
13696 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013697 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020013698 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013699 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020013700 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020013701 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013702
13703 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013704}
13705
/* Docstring for the "__sizeof__" entry of unicode_methods. */
PyDoc_STRVAR(sizeof__doc__,
             "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013708
13709static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013710unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013711{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013712 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013713 if (!copy)
13714 return NULL;
13715 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013716}
13717
/* Method table of the str type.  Each entry gives the Python-level name,
   the C implementation, the calling convention flags and the docstring.
   The table is terminated by the {NULL, NULL} sentinel. */
static PyMethodDef unicode_methods[] = {
    {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
    {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
    {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs,
     METH_VARARGS | METH_KEYWORDS, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
    {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines,
     METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
    {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
    {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
    {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
    {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
    {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
    /* The maketrans entry is supplied by this macro, which is defined
       outside this part of the file. */
    UNICODE_MAKETRANS_METHODDEF
    {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
#if 0
    /* These methods are just used for debugging the implementation. */
    {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
#endif

    /* No docstring is given for __getnewargs__. */
    {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
    {NULL, NULL}
};
13775
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013776static PyObject *
13777unicode_mod(PyObject *v, PyObject *w)
13778{
Brian Curtindfc80e32011-08-10 20:28:54 -050013779 if (!PyUnicode_Check(v))
13780 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013781 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013782}
13783
/* Number protocol slots of the str type.  Only nb_remainder is filled in
   (with unicode_mod); the slots after it are left out of the initializer
   and are therefore zero. */
static PyNumberMethods unicode_as_number = {
    0,              /*nb_add*/
    0,              /*nb_subtract*/
    0,              /*nb_multiply*/
    unicode_mod,            /*nb_remainder*/
};
13790
/* Sequence protocol slots of the str type.  The slice and item-assignment
   slots are 0, i.e. not provided through this table. */
static PySequenceMethods unicode_as_sequence = {
    (lenfunc) unicode_length,       /* sq_length */
    PyUnicode_Concat,           /* sq_concat */
    (ssizeargfunc) unicode_repeat,  /* sq_repeat */
    (ssizeargfunc) unicode_getitem,     /* sq_item */
    0,                  /* sq_slice */
    0,                  /* sq_ass_item */
    0,                  /* sq_ass_slice */
    PyUnicode_Contains,         /* sq_contains */
};
13801
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013802static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013803unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013804{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013805 if (PyUnicode_READY(self) == -1)
13806 return NULL;
13807
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013808 if (PyIndex_Check(item)) {
13809 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013810 if (i == -1 && PyErr_Occurred())
13811 return NULL;
13812 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013813 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013814 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013815 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013816 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013817 PyObject *result;
13818 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013819 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013820 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013821
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013822 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013823 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013824 return NULL;
13825 }
13826
13827 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020013828 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013829 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013830 slicelength == PyUnicode_GET_LENGTH(self)) {
13831 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013832 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013833 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013834 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013835 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013836 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013837 src_kind = PyUnicode_KIND(self);
13838 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013839 if (!PyUnicode_IS_ASCII(self)) {
13840 kind_limit = kind_maxchar_limit(src_kind);
13841 max_char = 0;
13842 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13843 ch = PyUnicode_READ(src_kind, src_data, cur);
13844 if (ch > max_char) {
13845 max_char = ch;
13846 if (max_char >= kind_limit)
13847 break;
13848 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013849 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013850 }
Victor Stinner55c99112011-10-13 01:17:06 +020013851 else
13852 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013853 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013854 if (result == NULL)
13855 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013856 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013857 dest_data = PyUnicode_DATA(result);
13858
13859 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013860 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13861 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013862 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013863 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013864 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013865 } else {
13866 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13867 return NULL;
13868 }
13869}
13870
/* Mapping protocol slots of the str type: length and subscripting are
   provided, item assignment is not (the slot is 0). */
static PyMappingMethods unicode_as_mapping = {
    (lenfunc)unicode_length,        /* mp_length */
    (binaryfunc)unicode_subscript,  /* mp_subscript */
    (objobjargproc)0,           /* mp_ass_subscript */
};
13876
Guido van Rossumd57fd912000-03-10 22:53:23 +000013877
Guido van Rossumd57fd912000-03-10 22:53:23 +000013878/* Helpers for PyUnicode_Format() */
13879
/* State shared by the helpers of PyUnicode_Format(): the arguments being
   consumed, the read position in the format string and the output writer. */
struct unicode_formatter_t {
    /* Argument source: indexed as a tuple by unicode_format_getnextarg()
       when arglen >= 0, returned as is when arglen < 0. */
    PyObject *args;
    /* Non-zero when args holds a reference that must be released;
       unicode_format_arg_parse() clears the flag and DECREFs args. */
    int args_owned;
    /* arglen: number of arguments (negative: see args);
       argidx: index of the next argument to hand out. */
    Py_ssize_t arglen, argidx;
    /* Must be non-NULL for "%(name)" keys, otherwise TypeError is raised
       ("format requires a mapping"). */
    PyObject *dict;

    /* fmtkind/fmtdata/fmtpos are what FORMAT_READ passes to PyUnicode_READ
       to fetch the current character of the format string. */
    enum PyUnicode_Kind fmtkind;
    /* fmtcnt is decremented as characters are consumed and is tested for
       being negative to detect a truncated format; fmtpos is the current
       read index. */
    Py_ssize_t fmtcnt, fmtpos;
    void *fmtdata;
    /* The format string object; "%(name)" keys are sliced out of it. */
    PyObject *fmtstr;

    /* Writer that formatted arguments can be written into directly. */
    _PyUnicodeWriter writer;
};
13893
/* Parsed description of a single conversion specifier of a format string. */
struct unicode_format_arg_t {
    /* Current format character; the number formatters use it as the
       conversion type (e.g. 'd', 'x', or the float format code). */
    Py_UCS4 ch;
    /* Bitmask of F_* flags (F_ALT, F_SIGN, F_BLANK are tested below). */
    int flags;
    /* Field width; -1 is what the fast paths test for, presumably meaning
       "not specified" -- confirm against the parser. */
    Py_ssize_t width;
    /* Precision; formatfloat() treats a negative value as "use 6". */
    int prec;
    /* NOTE(review): not read in this part of the file; looks like a flag
       telling the output step to handle the sign -- confirm. */
    int sign;
};
13901
Guido van Rossumd57fd912000-03-10 22:53:23 +000013902static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020013903unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013904{
Victor Stinnera47082312012-10-04 02:19:54 +020013905 Py_ssize_t argidx = ctx->argidx;
13906
13907 if (argidx < ctx->arglen) {
13908 ctx->argidx++;
13909 if (ctx->arglen < 0)
13910 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000013911 else
Victor Stinnera47082312012-10-04 02:19:54 +020013912 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013913 }
13914 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013915 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013916 return NULL;
13917}
13918
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013919/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013920
Victor Stinnera47082312012-10-04 02:19:54 +020013921/* Format a float into the writer if the writer is not NULL, or into *p_output
13922 otherwise.
13923
13924 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020013925static int
Victor Stinnera47082312012-10-04 02:19:54 +020013926formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
13927 PyObject **p_output,
13928 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013929{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013930 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013931 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013932 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020013933 int prec;
13934 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000013935
Guido van Rossumd57fd912000-03-10 22:53:23 +000013936 x = PyFloat_AsDouble(v);
13937 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020013938 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013939
Victor Stinnera47082312012-10-04 02:19:54 +020013940 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013941 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013942 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013943
Victor Stinnera47082312012-10-04 02:19:54 +020013944 if (arg->flags & F_ALT)
13945 dtoa_flags = Py_DTSF_ALT;
13946 else
13947 dtoa_flags = 0;
13948 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013949 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020013950 return -1;
13951 len = strlen(p);
13952 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010013953 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020013954 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013955 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020013956 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020013957 }
13958 else
13959 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000013960 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013961 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013962}
13963
/* _PyUnicode_FormatLong() emulates the format codes d, i, u, o, x and X,
 * and the alternate-form flag, for Python ints (val must satisfy
 * PyLong_Check()).
 * Return value:  a new reference to a str object, or NULL with an
 * exception set on error.
 * The output string is of the form
 *         "-"? ("0o" | "0x" | "0X")? digit+
 * The base prefix is present only for o, x and X conversions when alt is
 * non-zero.  The case of hex digits will be correct.
 * There will be at least prec digits, zero-filled on the left if
 * necessary to get that many.
 * val          object to be converted
 * alt          non-zero to keep the base prefix (alternate form)
 * prec         minimum number of digits; 0-fill on left if needed
 * type         a character in [diuoxX]; i and u act the same as d
 *
 * CAUTION:  o, x and X conversions can produce a '-' sign, since the
 * value is formatted as a signed number in the given base.
 */
PyObject *
_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
{
    PyObject *result = NULL;
    char *buf;
    Py_ssize_t i;
    int sign;           /* 1 if '-', else 0 */
    int len;            /* number of characters */
    Py_ssize_t llen;
    int numdigits;      /* len == numnondigits + numdigits */
    int numnondigits = 0;

    /* numnondigits is at most 3 (sign plus a two-character base prefix),
       so this keeps numnondigits + prec within the range of an int. */
    if (prec > INT_MAX-3) {
        PyErr_SetString(PyExc_OverflowError,
                        "precision too large");
        return NULL;
    }

    assert(PyLong_Check(val));

    switch (type) {
    default:
        /* Not expected; without assertions this falls through to 'd'. */
        assert(!"'type' not in [diuoxX]");
    case 'd':
    case 'i':
    case 'u':
        /* int and int subclasses should print numerically when a numeric */
        /* format code is used (see issue18780) */
        result = PyNumber_ToBase(val, 10);
        break;
    case 'o':
        /* Account for the two-character base prefix. */
        numnondigits = 2;
        result = PyNumber_ToBase(val, 8);
        break;
    case 'x':
    case 'X':
        numnondigits = 2;
        result = PyNumber_ToBase(val, 16);
        break;
    }
    if (!result)
        return NULL;

    assert(unicode_modifiable(result));
    assert(PyUnicode_IS_READY(result));
    assert(PyUnicode_IS_ASCII(result));

    /* To modify the string in-place, there can only be one reference. */
    if (Py_REFCNT(result) != 1) {
        Py_DECREF(result);
        PyErr_BadInternalCall();
        return NULL;
    }
    buf = PyUnicode_DATA(result);
    llen = PyUnicode_GET_LENGTH(result);
    if (llen > INT_MAX) {
        Py_DECREF(result);
        PyErr_SetString(PyExc_ValueError,
                        "string too large in _PyUnicode_FormatLong");
        return NULL;
    }
    len = (int)llen;
    sign = buf[0] == '-';
    numnondigits += sign;
    numdigits = len - numnondigits;
    assert(numdigits > 0);

    /* Get rid of base marker unless alt is set */
    if (((alt) == 0 &&
        (type == 'o' || type == 'x' || type == 'X'))) {
        assert(buf[sign] == '0');
        assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
               buf[sign+1] == 'o');
        numnondigits -= 2;
        /* Skip the two prefix characters by moving the start of the
           buffer forward, then put the sign back in front of the digits.
           From here on buf no longer points at the start of result's
           data, which the final conversion step below relies on. */
        buf += 2;
        len -= 2;
        if (sign)
            buf[0] = '-';
        assert(len == numnondigits + numdigits);
        assert(numdigits > 0);
    }

    /* Fill with leading zeroes to reach the requested number of digits. */
    if (prec > numdigits) {
        /* Build the padded text in a temporary bytes object: sign and
           prefix first, then the zeros, then the original digits. */
        PyObject *r1 = PyBytes_FromStringAndSize(NULL,
                                numnondigits + prec);
        char *b1;
        if (!r1) {
            Py_DECREF(result);
            return NULL;
        }
        b1 = PyBytes_AS_STRING(r1);
        for (i = 0; i < numnondigits; ++i)
            *b1++ = *buf++;
        for (i = 0; i < prec - numdigits; i++)
            *b1++ = '0';
        for (i = 0; i < numdigits; i++)
            *b1++ = *buf++;
        *b1 = '\0';
        Py_DECREF(result);
        result = r1;
        buf = PyBytes_AS_STRING(result);
        len = numnondigits + prec;
    }

    /* Fix up case for hex conversions. */
    if (type == 'X') {
        /* Need to convert all lower case letters to upper case.
           and need to convert 0x to 0X (and -0x to -0X). */
        for (i = 0; i < len; i++)
            if (buf[i] >= 'a' && buf[i] <= 'x')
                buf[i] -= 'a'-'A';
    }
    /* result is either the temporary bytes object, or a str whose text no
       longer starts at its data pointer: build a fresh str from buf. */
    if (!PyUnicode_Check(result)
        || buf != PyUnicode_DATA(result)) {
        PyObject *unicode;
        unicode = _PyUnicode_FromASCII(buf, len);
        Py_DECREF(result);
        result = unicode;
    }
    /* Otherwise result was edited in place; only its length may differ. */
    else if (len != PyUnicode_GET_LENGTH(result)) {
        if (PyUnicode_Resize(&result, len) < 0)
            Py_CLEAR(result);
    }
    return result;
}
14109
Ethan Furmandf3ed242014-01-05 06:50:30 -080014110/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014111 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014112 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014113 * -1 and raise an exception on error */
14114static int
Victor Stinnera47082312012-10-04 02:19:54 +020014115mainformatlong(PyObject *v,
14116 struct unicode_format_arg_t *arg,
14117 PyObject **p_output,
14118 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014119{
14120 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014121 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014122
14123 if (!PyNumber_Check(v))
14124 goto wrongtype;
14125
Ethan Furman9ab74802014-03-21 06:38:46 -070014126 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014127 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014128 if (type == 'o' || type == 'x' || type == 'X') {
14129 iobj = PyNumber_Index(v);
14130 if (iobj == NULL) {
Ethan Furman9ab74802014-03-21 06:38:46 -070014131 if (PyErr_ExceptionMatches(PyExc_TypeError))
14132 goto wrongtype;
Ethan Furman38d872e2014-03-19 08:38:52 -070014133 return -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014134 }
14135 }
14136 else {
14137 iobj = PyNumber_Long(v);
14138 if (iobj == NULL ) {
14139 if (PyErr_ExceptionMatches(PyExc_TypeError))
14140 goto wrongtype;
14141 return -1;
14142 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014143 }
14144 assert(PyLong_Check(iobj));
14145 }
14146 else {
14147 iobj = v;
14148 Py_INCREF(iobj);
14149 }
14150
14151 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014152 && arg->width == -1 && arg->prec == -1
14153 && !(arg->flags & (F_SIGN | F_BLANK))
14154 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014155 {
14156 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014157 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014158 int base;
14159
Victor Stinnera47082312012-10-04 02:19:54 +020014160 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014161 {
14162 default:
14163 assert(0 && "'type' not in [diuoxX]");
14164 case 'd':
14165 case 'i':
14166 case 'u':
14167 base = 10;
14168 break;
14169 case 'o':
14170 base = 8;
14171 break;
14172 case 'x':
14173 case 'X':
14174 base = 16;
14175 break;
14176 }
14177
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014178 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14179 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014180 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014181 }
14182 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014183 return 1;
14184 }
14185
Ethan Furmanb95b5612015-01-23 20:05:18 -080014186 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014187 Py_DECREF(iobj);
14188 if (res == NULL)
14189 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014190 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014191 return 0;
14192
14193wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014194 switch(type)
14195 {
14196 case 'o':
14197 case 'x':
14198 case 'X':
14199 PyErr_Format(PyExc_TypeError,
14200 "%%%c format: an integer is required, "
14201 "not %.200s",
14202 type, Py_TYPE(v)->tp_name);
14203 break;
14204 default:
14205 PyErr_Format(PyExc_TypeError,
14206 "%%%c format: a number is required, "
14207 "not %.200s",
14208 type, Py_TYPE(v)->tp_name);
14209 break;
14210 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014211 return -1;
14212}
14213
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014214static Py_UCS4
14215formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014216{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014217 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014218 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014219 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014220 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014221 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014222 goto onError;
14223 }
14224 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014225 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014226 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014227 /* make sure number is a type of integer */
14228 if (!PyLong_Check(v)) {
14229 iobj = PyNumber_Index(v);
14230 if (iobj == NULL) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014231 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014232 }
14233 v = iobj;
14234 Py_DECREF(iobj);
14235 }
14236 /* Integer input truncated to a character */
Benjamin Peterson29060642009-01-31 22:14:21 +000014237 x = PyLong_AsLong(v);
14238 if (x == -1 && PyErr_Occurred())
14239 goto onError;
14240
Victor Stinner8faf8212011-12-08 22:14:11 +010014241 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014242 PyErr_SetString(PyExc_OverflowError,
14243 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014244 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014245 }
14246
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014247 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014248 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014249
Benjamin Peterson29060642009-01-31 22:14:21 +000014250 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014251 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014252 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014253 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014254}
14255
Victor Stinnera47082312012-10-04 02:19:54 +020014256/* Parse options of an argument: flags, width, precision.
14257 Handle also "%(name)" syntax.
14258
14259 Return 0 if the argument has been formatted into arg->str.
14260 Return 1 if the argument has been written into ctx->writer,
14261 Raise an exception and return -1 on error. */
14262static int
14263unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14264 struct unicode_format_arg_t *arg)
14265{
14266#define FORMAT_READ(ctx) \
14267 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14268
14269 PyObject *v;
14270
Victor Stinnera47082312012-10-04 02:19:54 +020014271 if (arg->ch == '(') {
14272 /* Get argument value from a dictionary. Example: "%(name)s". */
14273 Py_ssize_t keystart;
14274 Py_ssize_t keylen;
14275 PyObject *key;
14276 int pcount = 1;
14277
14278 if (ctx->dict == NULL) {
14279 PyErr_SetString(PyExc_TypeError,
14280 "format requires a mapping");
14281 return -1;
14282 }
14283 ++ctx->fmtpos;
14284 --ctx->fmtcnt;
14285 keystart = ctx->fmtpos;
14286 /* Skip over balanced parentheses */
14287 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14288 arg->ch = FORMAT_READ(ctx);
14289 if (arg->ch == ')')
14290 --pcount;
14291 else if (arg->ch == '(')
14292 ++pcount;
14293 ctx->fmtpos++;
14294 }
14295 keylen = ctx->fmtpos - keystart - 1;
14296 if (ctx->fmtcnt < 0 || pcount > 0) {
14297 PyErr_SetString(PyExc_ValueError,
14298 "incomplete format key");
14299 return -1;
14300 }
14301 key = PyUnicode_Substring(ctx->fmtstr,
14302 keystart, keystart + keylen);
14303 if (key == NULL)
14304 return -1;
14305 if (ctx->args_owned) {
Victor Stinnera47082312012-10-04 02:19:54 +020014306 ctx->args_owned = 0;
Serhiy Storchaka191321d2015-12-27 15:41:34 +020014307 Py_DECREF(ctx->args);
Victor Stinnera47082312012-10-04 02:19:54 +020014308 }
14309 ctx->args = PyObject_GetItem(ctx->dict, key);
14310 Py_DECREF(key);
14311 if (ctx->args == NULL)
14312 return -1;
14313 ctx->args_owned = 1;
14314 ctx->arglen = -1;
14315 ctx->argidx = -2;
14316 }
14317
14318 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014319 while (--ctx->fmtcnt >= 0) {
14320 arg->ch = FORMAT_READ(ctx);
14321 ctx->fmtpos++;
14322 switch (arg->ch) {
14323 case '-': arg->flags |= F_LJUST; continue;
14324 case '+': arg->flags |= F_SIGN; continue;
14325 case ' ': arg->flags |= F_BLANK; continue;
14326 case '#': arg->flags |= F_ALT; continue;
14327 case '0': arg->flags |= F_ZERO; continue;
14328 }
14329 break;
14330 }
14331
14332 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014333 if (arg->ch == '*') {
14334 v = unicode_format_getnextarg(ctx);
14335 if (v == NULL)
14336 return -1;
14337 if (!PyLong_Check(v)) {
14338 PyErr_SetString(PyExc_TypeError,
14339 "* wants int");
14340 return -1;
14341 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014342 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014343 if (arg->width == -1 && PyErr_Occurred())
14344 return -1;
14345 if (arg->width < 0) {
14346 arg->flags |= F_LJUST;
14347 arg->width = -arg->width;
14348 }
14349 if (--ctx->fmtcnt >= 0) {
14350 arg->ch = FORMAT_READ(ctx);
14351 ctx->fmtpos++;
14352 }
14353 }
14354 else if (arg->ch >= '0' && arg->ch <= '9') {
14355 arg->width = arg->ch - '0';
14356 while (--ctx->fmtcnt >= 0) {
14357 arg->ch = FORMAT_READ(ctx);
14358 ctx->fmtpos++;
14359 if (arg->ch < '0' || arg->ch > '9')
14360 break;
14361 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14362 mixing signed and unsigned comparison. Since arg->ch is between
14363 '0' and '9', casting to int is safe. */
14364 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14365 PyErr_SetString(PyExc_ValueError,
14366 "width too big");
14367 return -1;
14368 }
14369 arg->width = arg->width*10 + (arg->ch - '0');
14370 }
14371 }
14372
14373 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014374 if (arg->ch == '.') {
14375 arg->prec = 0;
14376 if (--ctx->fmtcnt >= 0) {
14377 arg->ch = FORMAT_READ(ctx);
14378 ctx->fmtpos++;
14379 }
14380 if (arg->ch == '*') {
14381 v = unicode_format_getnextarg(ctx);
14382 if (v == NULL)
14383 return -1;
14384 if (!PyLong_Check(v)) {
14385 PyErr_SetString(PyExc_TypeError,
14386 "* wants int");
14387 return -1;
14388 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014389 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014390 if (arg->prec == -1 && PyErr_Occurred())
14391 return -1;
14392 if (arg->prec < 0)
14393 arg->prec = 0;
14394 if (--ctx->fmtcnt >= 0) {
14395 arg->ch = FORMAT_READ(ctx);
14396 ctx->fmtpos++;
14397 }
14398 }
14399 else if (arg->ch >= '0' && arg->ch <= '9') {
14400 arg->prec = arg->ch - '0';
14401 while (--ctx->fmtcnt >= 0) {
14402 arg->ch = FORMAT_READ(ctx);
14403 ctx->fmtpos++;
14404 if (arg->ch < '0' || arg->ch > '9')
14405 break;
14406 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14407 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014408 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014409 return -1;
14410 }
14411 arg->prec = arg->prec*10 + (arg->ch - '0');
14412 }
14413 }
14414 }
14415
14416 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14417 if (ctx->fmtcnt >= 0) {
14418 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14419 if (--ctx->fmtcnt >= 0) {
14420 arg->ch = FORMAT_READ(ctx);
14421 ctx->fmtpos++;
14422 }
14423 }
14424 }
14425 if (ctx->fmtcnt < 0) {
14426 PyErr_SetString(PyExc_ValueError,
14427 "incomplete format");
14428 return -1;
14429 }
14430 return 0;
14431
14432#undef FORMAT_READ
14433}
14434
14435/* Format one argument. Supported conversion specifiers:
14436
14437 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014438 - "i", "d", "u": int or float
14439 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014440 - "e", "E", "f", "F", "g", "G": float
14441 - "c": int or str (1 character)
14442
Victor Stinner8dbd4212012-12-04 09:30:24 +010014443 When possible, the output is written directly into the Unicode writer
14444 (ctx->writer). A string is created when padding is required.
14445
Victor Stinnera47082312012-10-04 02:19:54 +020014446 Return 0 if the argument has been formatted into *p_str,
14447 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014448 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014449static int
14450unicode_format_arg_format(struct unicode_formatter_t *ctx,
14451 struct unicode_format_arg_t *arg,
14452 PyObject **p_str)
14453{
14454 PyObject *v;
14455 _PyUnicodeWriter *writer = &ctx->writer;
14456
14457 if (ctx->fmtcnt == 0)
14458 ctx->writer.overallocate = 0;
14459
14460 if (arg->ch == '%') {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014461 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014462 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014463 return 1;
14464 }
14465
14466 v = unicode_format_getnextarg(ctx);
14467 if (v == NULL)
14468 return -1;
14469
Victor Stinnera47082312012-10-04 02:19:54 +020014470
14471 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014472 case 's':
14473 case 'r':
14474 case 'a':
14475 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14476 /* Fast path */
14477 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14478 return -1;
14479 return 1;
14480 }
14481
14482 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14483 *p_str = v;
14484 Py_INCREF(*p_str);
14485 }
14486 else {
14487 if (arg->ch == 's')
14488 *p_str = PyObject_Str(v);
14489 else if (arg->ch == 'r')
14490 *p_str = PyObject_Repr(v);
14491 else
14492 *p_str = PyObject_ASCII(v);
14493 }
14494 break;
14495
14496 case 'i':
14497 case 'd':
14498 case 'u':
14499 case 'o':
14500 case 'x':
14501 case 'X':
14502 {
14503 int ret = mainformatlong(v, arg, p_str, writer);
14504 if (ret != 0)
14505 return ret;
14506 arg->sign = 1;
14507 break;
14508 }
14509
14510 case 'e':
14511 case 'E':
14512 case 'f':
14513 case 'F':
14514 case 'g':
14515 case 'G':
14516 if (arg->width == -1 && arg->prec == -1
14517 && !(arg->flags & (F_SIGN | F_BLANK)))
14518 {
14519 /* Fast path */
14520 if (formatfloat(v, arg, NULL, writer) == -1)
14521 return -1;
14522 return 1;
14523 }
14524
14525 arg->sign = 1;
14526 if (formatfloat(v, arg, p_str, NULL) == -1)
14527 return -1;
14528 break;
14529
14530 case 'c':
14531 {
14532 Py_UCS4 ch = formatchar(v);
14533 if (ch == (Py_UCS4) -1)
14534 return -1;
14535 if (arg->width == -1 && arg->prec == -1) {
14536 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014537 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014538 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014539 return 1;
14540 }
14541 *p_str = PyUnicode_FromOrdinal(ch);
14542 break;
14543 }
14544
14545 default:
14546 PyErr_Format(PyExc_ValueError,
14547 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020014548 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020014549 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14550 (int)arg->ch,
14551 ctx->fmtpos - 1);
14552 return -1;
14553 }
14554 if (*p_str == NULL)
14555 return -1;
14556 assert (PyUnicode_Check(*p_str));
14557 return 0;
14558}
14559
14560static int
14561unicode_format_arg_output(struct unicode_formatter_t *ctx,
14562 struct unicode_format_arg_t *arg,
14563 PyObject *str)
14564{
14565 Py_ssize_t len;
14566 enum PyUnicode_Kind kind;
14567 void *pbuf;
14568 Py_ssize_t pindex;
14569 Py_UCS4 signchar;
14570 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014571 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014572 Py_ssize_t sublen;
14573 _PyUnicodeWriter *writer = &ctx->writer;
14574 Py_UCS4 fill;
14575
14576 fill = ' ';
14577 if (arg->sign && arg->flags & F_ZERO)
14578 fill = '0';
14579
14580 if (PyUnicode_READY(str) == -1)
14581 return -1;
14582
14583 len = PyUnicode_GET_LENGTH(str);
14584 if ((arg->width == -1 || arg->width <= len)
14585 && (arg->prec == -1 || arg->prec >= len)
14586 && !(arg->flags & (F_SIGN | F_BLANK)))
14587 {
14588 /* Fast path */
14589 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
14590 return -1;
14591 return 0;
14592 }
14593
14594 /* Truncate the string for "s", "r" and "a" formats
14595 if the precision is set */
14596 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
14597 if (arg->prec >= 0 && len > arg->prec)
14598 len = arg->prec;
14599 }
14600
14601 /* Adjust sign and width */
14602 kind = PyUnicode_KIND(str);
14603 pbuf = PyUnicode_DATA(str);
14604 pindex = 0;
14605 signchar = '\0';
14606 if (arg->sign) {
14607 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14608 if (ch == '-' || ch == '+') {
14609 signchar = ch;
14610 len--;
14611 pindex++;
14612 }
14613 else if (arg->flags & F_SIGN)
14614 signchar = '+';
14615 else if (arg->flags & F_BLANK)
14616 signchar = ' ';
14617 else
14618 arg->sign = 0;
14619 }
14620 if (arg->width < len)
14621 arg->width = len;
14622
14623 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014624 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014625 if (!(arg->flags & F_LJUST)) {
14626 if (arg->sign) {
14627 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014628 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014629 }
14630 else {
14631 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014632 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014633 }
14634 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014635 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
14636 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014637 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014638 }
14639
Victor Stinnera47082312012-10-04 02:19:54 +020014640 buflen = arg->width;
14641 if (arg->sign && len == arg->width)
14642 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014643 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020014644 return -1;
14645
14646 /* Write the sign if needed */
14647 if (arg->sign) {
14648 if (fill != ' ') {
14649 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14650 writer->pos += 1;
14651 }
14652 if (arg->width > len)
14653 arg->width--;
14654 }
14655
14656 /* Write the numeric prefix for "x", "X" and "o" formats
14657 if the alternate form is used.
14658 For example, write "0x" for the "%#x" format. */
14659 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14660 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14661 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14662 if (fill != ' ') {
14663 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14664 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14665 writer->pos += 2;
14666 pindex += 2;
14667 }
14668 arg->width -= 2;
14669 if (arg->width < 0)
14670 arg->width = 0;
14671 len -= 2;
14672 }
14673
14674 /* Pad left with the fill character if needed */
14675 if (arg->width > len && !(arg->flags & F_LJUST)) {
14676 sublen = arg->width - len;
14677 FILL(writer->kind, writer->data, fill, writer->pos, sublen);
14678 writer->pos += sublen;
14679 arg->width = len;
14680 }
14681
14682 /* If padding with spaces: write sign if needed and/or numeric prefix if
14683 the alternate form is used */
14684 if (fill == ' ') {
14685 if (arg->sign) {
14686 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14687 writer->pos += 1;
14688 }
14689 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14690 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14691 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
14692 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14693 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14694 writer->pos += 2;
14695 pindex += 2;
14696 }
14697 }
14698
14699 /* Write characters */
14700 if (len) {
14701 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14702 str, pindex, len);
14703 writer->pos += len;
14704 }
14705
14706 /* Pad right with the fill character if needed */
14707 if (arg->width > len) {
14708 sublen = arg->width - len;
14709 FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
14710 writer->pos += sublen;
14711 }
14712 return 0;
14713}
14714
14715/* Helper of PyUnicode_Format(): format one arg.
14716 Return 0 on success, raise an exception and return -1 on error. */
14717static int
14718unicode_format_arg(struct unicode_formatter_t *ctx)
14719{
14720 struct unicode_format_arg_t arg;
14721 PyObject *str;
14722 int ret;
14723
Victor Stinner8dbd4212012-12-04 09:30:24 +010014724 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
14725 arg.flags = 0;
14726 arg.width = -1;
14727 arg.prec = -1;
14728 arg.sign = 0;
14729 str = NULL;
14730
Victor Stinnera47082312012-10-04 02:19:54 +020014731 ret = unicode_format_arg_parse(ctx, &arg);
14732 if (ret == -1)
14733 return -1;
14734
14735 ret = unicode_format_arg_format(ctx, &arg, &str);
14736 if (ret == -1)
14737 return -1;
14738
14739 if (ret != 1) {
14740 ret = unicode_format_arg_output(ctx, &arg, str);
14741 Py_DECREF(str);
14742 if (ret == -1)
14743 return -1;
14744 }
14745
14746 if (ctx->dict && (ctx->argidx < ctx->arglen) && arg.ch != '%') {
14747 PyErr_SetString(PyExc_TypeError,
14748 "not all arguments converted during string formatting");
14749 return -1;
14750 }
14751 return 0;
14752}
14753
Alexander Belopolsky40018472011-02-26 01:02:56 +000014754PyObject *
14755PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014756{
Victor Stinnera47082312012-10-04 02:19:54 +020014757 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000014758
Guido van Rossumd57fd912000-03-10 22:53:23 +000014759 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014760 PyErr_BadInternalCall();
14761 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014762 }
Victor Stinnera47082312012-10-04 02:19:54 +020014763
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014764 if (ensure_unicode(format) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014765 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014766
14767 ctx.fmtstr = format;
Victor Stinnera47082312012-10-04 02:19:54 +020014768 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14769 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14770 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14771 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014772
Victor Stinner8f674cc2013-04-17 23:02:17 +020014773 _PyUnicodeWriter_Init(&ctx.writer);
14774 ctx.writer.min_length = ctx.fmtcnt + 100;
14775 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014776
Guido van Rossumd57fd912000-03-10 22:53:23 +000014777 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014778 ctx.arglen = PyTuple_Size(args);
14779 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014780 }
14781 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014782 ctx.arglen = -1;
14783 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014784 }
Victor Stinnera47082312012-10-04 02:19:54 +020014785 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040014786 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020014787 ctx.dict = args;
14788 else
14789 ctx.dict = NULL;
14790 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014791
Victor Stinnera47082312012-10-04 02:19:54 +020014792 while (--ctx.fmtcnt >= 0) {
14793 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020014794 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020014795
14796 nonfmtpos = ctx.fmtpos++;
14797 while (ctx.fmtcnt >= 0 &&
14798 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14799 ctx.fmtpos++;
14800 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014801 }
Victor Stinnera47082312012-10-04 02:19:54 +020014802 if (ctx.fmtcnt < 0) {
14803 ctx.fmtpos--;
14804 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020014805 }
Victor Stinneree4544c2012-05-09 22:24:08 +020014806
Victor Stinnercfc4c132013-04-03 01:48:39 +020014807 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14808 nonfmtpos, ctx.fmtpos) < 0)
14809 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014810 }
14811 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014812 ctx.fmtpos++;
14813 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000014814 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020014815 }
14816 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020014817
Victor Stinnera47082312012-10-04 02:19:54 +020014818 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014819 PyErr_SetString(PyExc_TypeError,
14820 "not all arguments converted during string formatting");
14821 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014822 }
14823
Victor Stinnera47082312012-10-04 02:19:54 +020014824 if (ctx.args_owned) {
14825 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014826 }
Victor Stinnera47082312012-10-04 02:19:54 +020014827 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014828
Benjamin Peterson29060642009-01-31 22:14:21 +000014829 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020014830 _PyUnicodeWriter_Dealloc(&ctx.writer);
14831 if (ctx.args_owned) {
14832 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014833 }
14834 return NULL;
14835}
14836
Jeremy Hylton938ace62002-07-17 16:30:39 +000014837static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000014838unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14839
Tim Peters6d6c1a32001-08-02 04:15:00 +000014840static PyObject *
14841unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14842{
Benjamin Peterson29060642009-01-31 22:14:21 +000014843 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014844 static char *kwlist[] = {"object", "encoding", "errors", 0};
14845 char *encoding = NULL;
14846 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000014847
Benjamin Peterson14339b62009-01-31 16:36:08 +000014848 if (type != &PyUnicode_Type)
14849 return unicode_subtype_new(type, args, kwds);
14850 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000014851 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000014852 return NULL;
14853 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020014854 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000014855 if (encoding == NULL && errors == NULL)
14856 return PyObject_Str(x);
14857 else
Benjamin Peterson29060642009-01-31 22:14:21 +000014858 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000014859}
14860
Guido van Rossume023fe02001-08-30 03:12:59 +000014861static PyObject *
14862unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14863{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014864 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014865 Py_ssize_t length, char_size;
14866 int share_wstr, share_utf8;
14867 unsigned int kind;
14868 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000014869
Benjamin Peterson14339b62009-01-31 16:36:08 +000014870 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014871
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014872 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014873 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014874 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014875 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050014876 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060014877 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014878 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060014879 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014880
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014881 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014882 if (self == NULL) {
14883 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014884 return NULL;
14885 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014886 kind = PyUnicode_KIND(unicode);
14887 length = PyUnicode_GET_LENGTH(unicode);
14888
14889 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014890#ifdef Py_DEBUG
14891 _PyUnicode_HASH(self) = -1;
14892#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014893 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014894#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014895 _PyUnicode_STATE(self).interned = 0;
14896 _PyUnicode_STATE(self).kind = kind;
14897 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020014898 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014899 _PyUnicode_STATE(self).ready = 1;
14900 _PyUnicode_WSTR(self) = NULL;
14901 _PyUnicode_UTF8_LENGTH(self) = 0;
14902 _PyUnicode_UTF8(self) = NULL;
14903 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020014904 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014905
14906 share_utf8 = 0;
14907 share_wstr = 0;
14908 if (kind == PyUnicode_1BYTE_KIND) {
14909 char_size = 1;
14910 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
14911 share_utf8 = 1;
14912 }
14913 else if (kind == PyUnicode_2BYTE_KIND) {
14914 char_size = 2;
14915 if (sizeof(wchar_t) == 2)
14916 share_wstr = 1;
14917 }
14918 else {
14919 assert(kind == PyUnicode_4BYTE_KIND);
14920 char_size = 4;
14921 if (sizeof(wchar_t) == 4)
14922 share_wstr = 1;
14923 }
14924
14925 /* Ensure we won't overflow the length. */
14926 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
14927 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014928 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014929 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014930 data = PyObject_MALLOC((length + 1) * char_size);
14931 if (data == NULL) {
14932 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014933 goto onError;
14934 }
14935
Victor Stinnerc3c74152011-10-02 20:39:55 +020014936 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014937 if (share_utf8) {
14938 _PyUnicode_UTF8_LENGTH(self) = length;
14939 _PyUnicode_UTF8(self) = data;
14940 }
14941 if (share_wstr) {
14942 _PyUnicode_WSTR_LENGTH(self) = length;
14943 _PyUnicode_WSTR(self) = (wchar_t *)data;
14944 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014945
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014946 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020014947 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014948 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014949#ifdef Py_DEBUG
14950 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14951#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020014952 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010014953 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014954
14955onError:
14956 Py_DECREF(unicode);
14957 Py_DECREF(self);
14958 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000014959}
14960
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000014961PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070014962"str(object='') -> str\n\
14963str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000014964\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100014965Create a new string object from the given object. If encoding or\n\
14966errors is specified, then the object must expose a data buffer\n\
14967that will be decoded using the given encoding and error handler.\n\
14968Otherwise, returns the result of object.__str__() (if defined)\n\
14969or repr(object).\n\
14970encoding defaults to sys.getdefaultencoding().\n\
14971errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000014972
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014973static PyObject *unicode_iter(PyObject *seq);
14974
Guido van Rossumd57fd912000-03-10 22:53:23 +000014975PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000014976 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014977 "str", /* tp_name */
14978 sizeof(PyUnicodeObject), /* tp_size */
14979 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014980 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014981 (destructor)unicode_dealloc, /* tp_dealloc */
14982 0, /* tp_print */
14983 0, /* tp_getattr */
14984 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014985 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014986 unicode_repr, /* tp_repr */
14987 &unicode_as_number, /* tp_as_number */
14988 &unicode_as_sequence, /* tp_as_sequence */
14989 &unicode_as_mapping, /* tp_as_mapping */
14990 (hashfunc) unicode_hash, /* tp_hash*/
14991 0, /* tp_call*/
14992 (reprfunc) unicode_str, /* tp_str */
14993 PyObject_GenericGetAttr, /* tp_getattro */
14994 0, /* tp_setattro */
14995 0, /* tp_as_buffer */
14996 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000014997 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014998 unicode_doc, /* tp_doc */
14999 0, /* tp_traverse */
15000 0, /* tp_clear */
15001 PyUnicode_RichCompare, /* tp_richcompare */
15002 0, /* tp_weaklistoffset */
15003 unicode_iter, /* tp_iter */
15004 0, /* tp_iternext */
15005 unicode_methods, /* tp_methods */
15006 0, /* tp_members */
15007 0, /* tp_getset */
15008 &PyBaseObject_Type, /* tp_base */
15009 0, /* tp_dict */
15010 0, /* tp_descr_get */
15011 0, /* tp_descr_set */
15012 0, /* tp_dictoffset */
15013 0, /* tp_init */
15014 0, /* tp_alloc */
15015 unicode_new, /* tp_new */
15016 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015017};
15018
15019/* Initialize the Unicode implementation */
15020
Victor Stinner3a50e702011-10-18 21:21:00 +020015021int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015022{
Thomas Wouters477c8d52006-05-27 19:21:47 +000015023 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015024 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000015025 0x000A, /* LINE FEED */
15026 0x000D, /* CARRIAGE RETURN */
15027 0x001C, /* FILE SEPARATOR */
15028 0x001D, /* GROUP SEPARATOR */
15029 0x001E, /* RECORD SEPARATOR */
15030 0x0085, /* NEXT LINE */
15031 0x2028, /* LINE SEPARATOR */
15032 0x2029, /* PARAGRAPH SEPARATOR */
15033 };
15034
Fred Drakee4315f52000-05-09 19:53:39 +000015035 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020015036 _Py_INCREF_UNICODE_EMPTY();
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015037 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015038 Py_FatalError("Can't create empty string");
Serhiy Storchaka678db842013-01-26 12:16:36 +020015039 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015040
Guido van Rossumcacfc072002-05-24 19:01:59 +000015041 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000015042 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000015043
15044 /* initialize the linebreak bloom filter */
15045 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015046 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020015047 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015048
Christian Heimes26532f72013-07-20 14:57:16 +020015049 if (PyType_Ready(&EncodingMapType) < 0)
15050 Py_FatalError("Can't initialize encoding map type");
Victor Stinner3a50e702011-10-18 21:21:00 +020015051
Benjamin Petersonc4311282012-10-30 23:21:10 -040015052 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
15053 Py_FatalError("Can't initialize field name iterator type");
15054
15055 if (PyType_Ready(&PyFormatterIter_Type) < 0)
15056 Py_FatalError("Can't initialize formatter iter type");
Benjamin Petersone8ea97f2012-10-30 23:27:52 -040015057
Victor Stinner3a50e702011-10-18 21:21:00 +020015058 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015059}
15060
15061/* Finalize the Unicode implementation */
15062
/* Clear the free list of the Unicode implementation.
   Nothing is released here; the function always returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int released = 0;

    return released;
}
15068
Guido van Rossumd57fd912000-03-10 22:53:23 +000015069void
Thomas Wouters78890102000-07-22 19:25:51 +000015070_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015071{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000015072 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015073
Serhiy Storchaka05997252013-01-26 12:14:02 +020015074 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +000015075
Serhiy Storchaka05997252013-01-26 12:14:02 +020015076 for (i = 0; i < 256; i++)
15077 Py_CLEAR(unicode_latin1[i]);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020015078 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000015079 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015080}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015081
Walter Dörwald16807132007-05-25 13:52:07 +000015082void
15083PyUnicode_InternInPlace(PyObject **p)
15084{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015085 PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015086 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015087#ifdef Py_DEBUG
15088 assert(s != NULL);
15089 assert(_PyUnicode_CHECK(s));
15090#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000015091 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020015092 return;
15093#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015094 /* If it's a subclass, we don't really know what putting
15095 it in the interned dict might do. */
15096 if (!PyUnicode_CheckExact(s))
15097 return;
15098 if (PyUnicode_CHECK_INTERNED(s))
15099 return;
15100 if (interned == NULL) {
15101 interned = PyDict_New();
15102 if (interned == NULL) {
15103 PyErr_Clear(); /* Don't leave an exception */
15104 return;
15105 }
15106 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015107 Py_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015108 t = PyDict_SetDefault(interned, s, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015109 Py_END_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015110 if (t == NULL) {
15111 PyErr_Clear();
15112 return;
15113 }
15114 if (t != s) {
Victor Stinnerf0335102013-04-14 19:13:03 +020015115 Py_INCREF(t);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030015116 Py_SETREF(*p, t);
Victor Stinnerf0335102013-04-14 19:13:03 +020015117 return;
15118 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015119 /* The two references in interned are not counted by refcnt.
15120 The deallocator will take care of this */
15121 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015122 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000015123}
15124
15125void
15126PyUnicode_InternImmortal(PyObject **p)
15127{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015128 PyUnicode_InternInPlace(p);
15129 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015130 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015131 Py_INCREF(*p);
15132 }
Walter Dörwald16807132007-05-25 13:52:07 +000015133}
15134
15135PyObject *
15136PyUnicode_InternFromString(const char *cp)
15137{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015138 PyObject *s = PyUnicode_FromString(cp);
15139 if (s == NULL)
15140 return NULL;
15141 PyUnicode_InternInPlace(&s);
15142 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015143}
15144
Alexander Belopolsky40018472011-02-26 01:02:56 +000015145void
15146_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000015147{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015148 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015149 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015150 Py_ssize_t i, n;
15151 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000015152
Benjamin Peterson14339b62009-01-31 16:36:08 +000015153 if (interned == NULL || !PyDict_Check(interned))
15154 return;
15155 keys = PyDict_Keys(interned);
15156 if (keys == NULL || !PyList_Check(keys)) {
15157 PyErr_Clear();
15158 return;
15159 }
Walter Dörwald16807132007-05-25 13:52:07 +000015160
Benjamin Peterson14339b62009-01-31 16:36:08 +000015161 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
15162 detector, interned unicode strings are not forcibly deallocated;
15163 rather, we give them their stolen references back, and then clear
15164 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000015165
Benjamin Peterson14339b62009-01-31 16:36:08 +000015166 n = PyList_GET_SIZE(keys);
15167 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000015168 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015169 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015170 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015171 if (PyUnicode_READY(s) == -1) {
15172 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015173 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015174 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015175 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015176 case SSTATE_NOT_INTERNED:
15177 /* XXX Shouldn't happen */
15178 break;
15179 case SSTATE_INTERNED_IMMORTAL:
15180 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015181 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015182 break;
15183 case SSTATE_INTERNED_MORTAL:
15184 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015185 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015186 break;
15187 default:
15188 Py_FatalError("Inconsistent interned string state.");
15189 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015190 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015191 }
15192 fprintf(stderr, "total size of all interned strings: "
15193 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
15194 "mortal/immortal\n", mortal_size, immortal_size);
15195 Py_DECREF(keys);
15196 PyDict_Clear(interned);
Serhiy Storchaka05997252013-01-26 12:14:02 +020015197 Py_CLEAR(interned);
Walter Dörwald16807132007-05-25 13:52:07 +000015198}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015199
15200
15201/********************* Unicode Iterator **************************/
15202
15203typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015204 PyObject_HEAD
15205 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015206 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015207} unicodeiterobject;
15208
15209static void
15210unicodeiter_dealloc(unicodeiterobject *it)
15211{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015212 _PyObject_GC_UNTRACK(it);
15213 Py_XDECREF(it->it_seq);
15214 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015215}
15216
15217static int
15218unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15219{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015220 Py_VISIT(it->it_seq);
15221 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015222}
15223
15224static PyObject *
15225unicodeiter_next(unicodeiterobject *it)
15226{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015227 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015228
Benjamin Peterson14339b62009-01-31 16:36:08 +000015229 assert(it != NULL);
15230 seq = it->it_seq;
15231 if (seq == NULL)
15232 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015233 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015234
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015235 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15236 int kind = PyUnicode_KIND(seq);
15237 void *data = PyUnicode_DATA(seq);
15238 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15239 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015240 if (item != NULL)
15241 ++it->it_index;
15242 return item;
15243 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015244
Benjamin Peterson14339b62009-01-31 16:36:08 +000015245 it->it_seq = NULL;
Serhiy Storchakafbb1c5e2016-03-30 20:40:02 +030015246 Py_DECREF(seq);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015247 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015248}
15249
15250static PyObject *
15251unicodeiter_len(unicodeiterobject *it)
15252{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015253 Py_ssize_t len = 0;
15254 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015255 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015256 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015257}
15258
15259PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15260
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015261static PyObject *
15262unicodeiter_reduce(unicodeiterobject *it)
15263{
15264 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020015265 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015266 it->it_seq, it->it_index);
15267 } else {
15268 PyObject *u = PyUnicode_FromUnicode(NULL, 0);
15269 if (u == NULL)
15270 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020015271 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015272 }
15273}
15274
15275PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15276
15277static PyObject *
15278unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15279{
15280 Py_ssize_t index = PyLong_AsSsize_t(state);
15281 if (index == -1 && PyErr_Occurred())
15282 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015283 if (it->it_seq != NULL) {
15284 if (index < 0)
15285 index = 0;
15286 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15287 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15288 it->it_index = index;
15289 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015290 Py_RETURN_NONE;
15291}
15292
15293PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15294
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015295static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015296 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015297 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015298 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15299 reduce_doc},
15300 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15301 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015302 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015303};
15304
15305PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015306 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15307 "str_iterator", /* tp_name */
15308 sizeof(unicodeiterobject), /* tp_basicsize */
15309 0, /* tp_itemsize */
15310 /* methods */
15311 (destructor)unicodeiter_dealloc, /* tp_dealloc */
15312 0, /* tp_print */
15313 0, /* tp_getattr */
15314 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000015315 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015316 0, /* tp_repr */
15317 0, /* tp_as_number */
15318 0, /* tp_as_sequence */
15319 0, /* tp_as_mapping */
15320 0, /* tp_hash */
15321 0, /* tp_call */
15322 0, /* tp_str */
15323 PyObject_GenericGetAttr, /* tp_getattro */
15324 0, /* tp_setattro */
15325 0, /* tp_as_buffer */
15326 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15327 0, /* tp_doc */
15328 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15329 0, /* tp_clear */
15330 0, /* tp_richcompare */
15331 0, /* tp_weaklistoffset */
15332 PyObject_SelfIter, /* tp_iter */
15333 (iternextfunc)unicodeiter_next, /* tp_iternext */
15334 unicodeiter_methods, /* tp_methods */
15335 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015336};
15337
15338static PyObject *
15339unicode_iter(PyObject *seq)
15340{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015341 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015342
Benjamin Peterson14339b62009-01-31 16:36:08 +000015343 if (!PyUnicode_Check(seq)) {
15344 PyErr_BadInternalCall();
15345 return NULL;
15346 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015347 if (PyUnicode_READY(seq) == -1)
15348 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015349 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15350 if (it == NULL)
15351 return NULL;
15352 it->it_index = 0;
15353 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015354 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015355 _PyObject_GC_TRACK(it);
15356 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015357}
15358
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015359
15360size_t
15361Py_UNICODE_strlen(const Py_UNICODE *u)
15362{
15363 int res = 0;
15364 while(*u++)
15365 res++;
15366 return res;
15367}
15368
15369Py_UNICODE*
15370Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15371{
15372 Py_UNICODE *u = s1;
15373 while ((*u++ = *s2++));
15374 return s1;
15375}
15376
15377Py_UNICODE*
15378Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15379{
15380 Py_UNICODE *u = s1;
15381 while ((*u++ = *s2++))
15382 if (n-- == 0)
15383 break;
15384 return s1;
15385}
15386
15387Py_UNICODE*
15388Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15389{
15390 Py_UNICODE *u1 = s1;
15391 u1 += Py_UNICODE_strlen(u1);
15392 Py_UNICODE_strcpy(u1, s2);
15393 return s1;
15394}
15395
15396int
15397Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15398{
15399 while (*s1 && *s2 && *s1 == *s2)
15400 s1++, s2++;
15401 if (*s1 && *s2)
15402 return (*s1 < *s2) ? -1 : +1;
15403 if (*s1)
15404 return 1;
15405 if (*s2)
15406 return -1;
15407 return 0;
15408}
15409
15410int
15411Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15412{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015413 Py_UNICODE u1, u2;
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015414 for (; n != 0; n--) {
15415 u1 = *s1;
15416 u2 = *s2;
15417 if (u1 != u2)
15418 return (u1 < u2) ? -1 : +1;
15419 if (u1 == '\0')
15420 return 0;
15421 s1++;
15422 s2++;
15423 }
15424 return 0;
15425}
15426
15427Py_UNICODE*
15428Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15429{
15430 const Py_UNICODE *p;
15431 for (p = s; *p; p++)
15432 if (*p == c)
15433 return (Py_UNICODE*)p;
15434 return NULL;
15435}
15436
15437Py_UNICODE*
15438Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15439{
15440 const Py_UNICODE *p;
15441 p = s + Py_UNICODE_strlen(s);
15442 while (p != s) {
15443 p--;
15444 if (*p == c)
15445 return (Py_UNICODE*)p;
15446 }
15447 return NULL;
15448}
Victor Stinner331ea922010-08-10 16:37:20 +000015449
Victor Stinner71133ff2010-09-01 23:43:53 +000015450Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015451PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015452{
Victor Stinner577db2c2011-10-11 22:12:48 +020015453 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015454 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015455
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015456 if (!PyUnicode_Check(unicode)) {
15457 PyErr_BadArgument();
15458 return NULL;
15459 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015460 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015461 if (u == NULL)
15462 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015463 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -070015464 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015465 PyErr_NoMemory();
15466 return NULL;
15467 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015468 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015469 size *= sizeof(Py_UNICODE);
15470 copy = PyMem_Malloc(size);
15471 if (copy == NULL) {
15472 PyErr_NoMemory();
15473 return NULL;
15474 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015475 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015476 return copy;
15477}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015478
Georg Brandl66c221e2010-10-14 07:04:07 +000015479/* A _string module, to export formatter_parser and formatter_field_name_split
15480 to the string.Formatter class implemented in Python. */
15481
15482static PyMethodDef _string_methods[] = {
15483 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15484 METH_O, PyDoc_STR("split the argument as a field name")},
15485 {"formatter_parser", (PyCFunction) formatter_parser,
15486 METH_O, PyDoc_STR("parse the argument as a format string")},
15487 {NULL, NULL}
15488};
15489
15490static struct PyModuleDef _string_module = {
15491 PyModuleDef_HEAD_INIT,
15492 "_string",
15493 PyDoc_STR("string helper module"),
15494 0,
15495 _string_methods,
15496 NULL,
15497 NULL,
15498 NULL,
15499 NULL
15500};
15501
15502PyMODINIT_FUNC
15503PyInit__string(void)
15504{
15505 return PyModule_Create(&_string_module);
15506}
15507
15508
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000015509#ifdef __cplusplus
15510}
15511#endif