blob: 50b21cf9e65822ce3dd31ad51f1fdd30c0cb87ac [file] [log] [blame]
/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com>.

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Raymond Hettingerac2ef652015-07-04 16:04:44 -070045#include "stringlib/eq.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000046
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000047#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000048#include <windows.h>
49#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000050
Larry Hastings61272b72014-01-07 12:41:53 -080051/*[clinic input]
Larry Hastingsc2047262014-01-25 20:43:29 -080052class str "PyUnicodeObject *" "&PyUnicode_Type"
Larry Hastings61272b72014-01-07 12:41:53 -080053[clinic start generated code]*/
Larry Hastings581ee362014-01-28 05:00:08 -080054/*[clinic end generated code: output=da39a3ee5e6b4b0d input=604e916854800fa8]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080055
/* --- Globals ------------------------------------------------------------

NOTE: In the interpreter's initialization phase, some globals are currently
      initialized dynamically as needed. In the process Unicode objects may
      be created before the Unicode type is ready.

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000063
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000064
65#ifdef __cplusplus
66extern "C" {
67#endif
68
/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
#define MAX_UNICODE 0x10ffff

/* Object sanity check used by the assertions below: the full structural
   consistency check in debug builds, a plain type check otherwise. */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif

/* Raw field accessors (names starting with '_') perform no checking and
   are usable as lvalues.  The checked variants assert that the object is
   consistent (and ready, where noted) before reading. */

/* Raw utf8 pointer stored in the compact header. */
#define _PyUnicode_UTF8(op) \
    (((PyCompactUnicodeObject*)(op))->utf8)
/* UTF-8 buffer of a ready string.  For compact ASCII strings this is the
   memory located right after the PyASCIIObject header. */
#define PyUnicode_UTF8(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(PyUnicode_IS_READY(op)), \
     PyUnicode_IS_COMPACT_ASCII(op) ? \
         ((char*)((PyASCIIObject*)(op) + 1)) : \
         _PyUnicode_UTF8(op))
/* Raw utf8_length field stored in the compact header. */
#define _PyUnicode_UTF8_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
/* UTF-8 length of a ready string; for compact ASCII strings this is the
   character length itself. */
#define PyUnicode_UTF8_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(PyUnicode_IS_READY(op)), \
     PyUnicode_IS_COMPACT_ASCII(op) ? \
         ((PyASCIIObject*)(op))->length : \
         _PyUnicode_UTF8_LENGTH(op))
#define _PyUnicode_WSTR(op) \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op) \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op) \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op) \
    (((PyASCIIObject *)(op))->hash)
/* Checked read of the storage kind. */
#define _PyUnicode_KIND(op) \
    (assert(_PyUnicode_CHECK(op)), \
     ((PyASCIIObject *)(op))->state.kind)
/* Checked read of the character length. */
#define _PyUnicode_GET_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), \
     ((PyASCIIObject *)(op))->length)
/* Raw data pointer of a (non-compact) PyUnicodeObject. */
#define _PyUnicode_DATA_ANY(op) \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200112
/* File-local replacement for the public PyUnicode_READY(): asserts that
   the object passes _PyUnicode_CHECK, then evaluates to 0 if the string is
   already ready and to the result of _PyUnicode_Ready() otherwise. */
#undef PyUnicode_READY
#define PyUnicode_READY(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (PyUnicode_IS_READY(op) ? \
      0 : \
      _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200119
/* True if the utf8 pointer aliases the canonical data buffer.  Must not be
   used on compact ASCII strings (asserted). */
#define _PyUnicode_SHARE_UTF8(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)), \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the wstr pointer aliases the canonical data buffer.
   The comparison uses the macro parameter 'op' on both sides; it must not
   depend on a variable named 'unicode' existing at the expansion site. */
#define _PyUnicode_SHARE_WSTR(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
127
/* true if the Unicode object has an allocated UTF-8 memory block
   (not shared with other data) */
#define _PyUnicode_HAS_UTF8_MEMORY(op) \
    ((!PyUnicode_IS_COMPACT_ASCII(op) \
      && _PyUnicode_UTF8(op) \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* true if the Unicode object has an allocated wstr memory block
   (not shared with other data).  A string that is not ready yet always
   counts as owning its wstr block when wstr is non-NULL. */
#define _PyUnicode_HAS_WSTR_MEMORY(op) \
    ((_PyUnicode_WSTR(op) && \
      (!PyUnicode_IS_READY(op) || \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
141
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   The copy is unrolled four elements at a time; the remaining 0-3
   elements are handled by the trailing loop.  All locals carry a leading
   underscore so they do not shadow names at the expansion site. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do { \
        to_type *_to = (to_type *)(to); \
        const from_type *_iter = (const from_type *)(begin); \
        const from_type *_end = (const from_type *)(end); \
        Py_ssize_t _n = (_end) - (_iter); \
        const from_type *_unrolled_end = \
            _iter + _Py_SIZE_ROUND_DOWN(_n, 4); \
        while (_iter < (_unrolled_end)) { \
            _to[0] = (to_type) _iter[0]; \
            _to[1] = (to_type) _iter[1]; \
            _to[2] = (to_type) _iter[2]; \
            _to[3] = (to_type) _iter[3]; \
            _iter += 4; _to += 4; \
        } \
        while (_iter < (_end)) \
            *_to++ = (to_type) *_iter++; \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200165
/* Divisor used when over-allocating growing buffers: a factor of N means
   "reserve an extra 1/N of the requested size". */
#ifdef MS_WINDOWS
   /* On Windows, overallocate by 50% is the best factor */
#  define OVERALLOCATE_FACTOR 2
#else
   /* On Linux, overallocate by 25% is the best factor */
#  define OVERALLOCATE_FACTOR 4
#endif
173
Walter Dörwald16807132007-05-25 13:52:07 +0000174/* This dictionary holds all interned unicode strings. Note that references
175 to strings in this dictionary are *not* counted in the string's ob_refcnt.
176 When the interned string reaches a refcnt of 0 the string deallocation
177 function will delete the reference from this dictionary.
178
179 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000180 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000181*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200182static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000183
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000184/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200185static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200186
Serhiy Storchaka678db842013-01-26 12:16:36 +0200187#define _Py_INCREF_UNICODE_EMPTY() \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200188 do { \
189 if (unicode_empty != NULL) \
190 Py_INCREF(unicode_empty); \
191 else { \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200192 unicode_empty = PyUnicode_New(0, 0); \
193 if (unicode_empty != NULL) { \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200194 Py_INCREF(unicode_empty); \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200195 assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
196 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200197 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200198 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000199
Serhiy Storchaka678db842013-01-26 12:16:36 +0200200#define _Py_RETURN_UNICODE_EMPTY() \
201 do { \
202 _Py_INCREF_UNICODE_EMPTY(); \
203 return unicode_empty; \
204 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000205
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200206/* Forward declaration */
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700207static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200208_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
209
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200210/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200211static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200212
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000213/* Single character Unicode strings in the Latin-1 range are being
214 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200215static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000216
/* Fast detection of the most frequent whitespace characters.
   128-entry table indexed by ASCII code point; an entry is 1 when the
   character is treated as whitespace, 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F
       0x0009: CHARACTER TABULATION
       0x000A: LINE FEED
       0x000B: LINE TABULATION
       0x000C: FORM FEED
       0x000D: CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F
       0x001C: FILE SEPARATOR
       0x001D: GROUP SEPARATOR
       0x001E: RECORD SEPARATOR
       0x001F: UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27
       0x0020: SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
247
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200248/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200249static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200250static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100251static int unicode_modifiable(PyObject *unicode);
252
Victor Stinnerfe226c02011-10-03 03:52:20 +0200253
Alexander Belopolsky40018472011-02-26 01:02:56 +0000254static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100255_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200256static PyObject *
257_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
258static PyObject *
259_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
260
261static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000262unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000263 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100264 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000265 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
266
Alexander Belopolsky40018472011-02-26 01:02:56 +0000267static void
268raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300269 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100270 PyObject *unicode,
271 Py_ssize_t startpos, Py_ssize_t endpos,
272 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000273
/* Fast detection of line breaks.
   128-entry table indexed by ASCII code point; an entry is 1 when the
   character is treated as a line break, 0 otherwise. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F
       0x000A: LINE FEED
       0x000B: LINE TABULATION
       0x000C: FORM FEED
       0x000D: CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F
       0x001C: FILE SEPARATOR
       0x001D: GROUP SEPARATOR
       0x001E: RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
301
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300302#include "clinic/unicodeobject.c.h"
303
/* Codec error handlers recognised by name.  _Py_ERROR_UNKNOWN is 0 so a
   zero-initialised variable means "not looked up yet"; _Py_ERROR_OTHER
   stands for any name that is not in the list below. */
typedef enum {
    _Py_ERROR_UNKNOWN=0,
    _Py_ERROR_STRICT,
    _Py_ERROR_SURROGATEESCAPE,
    _Py_ERROR_REPLACE,
    _Py_ERROR_IGNORE,
    _Py_ERROR_BACKSLASHREPLACE,
    _Py_ERROR_SURROGATEPASS,
    _Py_ERROR_XMLCHARREFREPLACE,
    _Py_ERROR_OTHER
} _Py_error_handler;

/* Map an error handler name to its _Py_error_handler value.

   errors: NUL-terminated handler name, or NULL.

   Returns _Py_ERROR_STRICT for NULL or "strict", the matching constant for
   the other names listed below, and _Py_ERROR_OTHER for anything else.
   The comparison is exact (case-sensitive). */
static _Py_error_handler
get_error_handler(const char *errors)
{
    static const struct {
        const char *name;
        _Py_error_handler handler;
    } known_handlers[] = {
        {"strict",            _Py_ERROR_STRICT},
        {"surrogateescape",   _Py_ERROR_SURROGATEESCAPE},
        {"replace",           _Py_ERROR_REPLACE},
        {"ignore",            _Py_ERROR_IGNORE},
        {"backslashreplace",  _Py_ERROR_BACKSLASHREPLACE},
        {"surrogatepass",     _Py_ERROR_SURROGATEPASS},
        {"xmlcharrefreplace", _Py_ERROR_XMLCHARREFREPLACE},
    };
    size_t i;

    if (errors == NULL) {
        return _Py_ERROR_STRICT;
    }
    for (i = 0; i < sizeof(known_handlers) / sizeof(known_handlers[0]); i++) {
        if (strcmp(errors, known_handlers[i].name) == 0) {
            return known_handlers[i].handler;
        }
    }
    return _Py_ERROR_OTHER;
}
342
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300343/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
344 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000345Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000346PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000347{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000348#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000349 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000350#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000351 /* This is actually an illegal character, so it should
352 not be passed to unichr. */
353 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000354#endif
355}
356
#ifdef Py_DEBUG
/* Debug-only structural check of a Unicode object.

   op:            object to check; must satisfy PyUnicode_Check().
   check_content: when non-zero (and the string is not in the legacy
                  wchar_t-only state), also scan the characters to verify
                  that the narrowest possible kind is used and that the
                  buffer is NUL-terminated.

   Every violated invariant trips an assert(); the function returns 1
   otherwise, so it can itself be used inside assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII: only the PyASCIIObject header exists */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* compact non-ASCII: data follows the compact header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }
        else {
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* legacy string that is not ready: only wstr is set */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
            else {
                /* legacy string that has been made ready */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
                else {
                    assert (compact->utf8 != data);
                }
            }
        }
        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr shares the data buffer exactly when the kind has the
               same width as wchar_t */
            if (
#if SIZEOF_WCHAR_T == 2
                kind == PyUnicode_2BYTE_KIND
#else
                kind == PyUnicode_4BYTE_KIND
#endif
               )
            {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            }
            else {
                assert(ascii->wstr != data);
            }
        }

        if (compact->utf8 == NULL) {
            assert(compact->utf8_length == 0);
        }
        if (ascii->wstr == NULL) {
            assert(compact->wstr_length == 0);
        }
    }
    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND)
    {
        Py_ssize_t i;
        Py_UCS4 maxchar = 0;
        void *data;
        Py_UCS4 ch;

        data = PyUnicode_DATA(ascii);
        for (i=0; i < ascii->length; i++)
        {
            ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
        }
        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else {
                assert(maxchar < 128);
            }
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
        }
        else {
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
        }
        /* the character after the last one must be a NUL terminator */
        assert(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;
}
#endif
472
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100473static PyObject*
474unicode_result_wchar(PyObject *unicode)
475{
476#ifndef Py_DEBUG
477 Py_ssize_t len;
478
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100479 len = _PyUnicode_WSTR_LENGTH(unicode);
480 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100481 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200482 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100483 }
484
485 if (len == 1) {
486 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100487 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100488 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
489 Py_DECREF(unicode);
490 return latin1_char;
491 }
492 }
493
494 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200495 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100496 return NULL;
497 }
498#else
Victor Stinneraa771272012-10-04 02:32:58 +0200499 assert(Py_REFCNT(unicode) == 1);
500
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100501 /* don't make the result ready in debug mode to ensure that the caller
502 makes the string ready before using it */
503 assert(_PyUnicode_CheckConsistency(unicode, 1));
504#endif
505 return unicode;
506}
507
508static PyObject*
509unicode_result_ready(PyObject *unicode)
510{
511 Py_ssize_t length;
512
513 length = PyUnicode_GET_LENGTH(unicode);
514 if (length == 0) {
515 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100516 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200517 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100518 }
519 return unicode_empty;
520 }
521
522 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200523 void *data = PyUnicode_DATA(unicode);
524 int kind = PyUnicode_KIND(unicode);
525 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100526 if (ch < 256) {
527 PyObject *latin1_char = unicode_latin1[ch];
528 if (latin1_char != NULL) {
529 if (unicode != latin1_char) {
530 Py_INCREF(latin1_char);
531 Py_DECREF(unicode);
532 }
533 return latin1_char;
534 }
535 else {
536 assert(_PyUnicode_CheckConsistency(unicode, 1));
537 Py_INCREF(unicode);
538 unicode_latin1[ch] = unicode;
539 return unicode;
540 }
541 }
542 }
543
544 assert(_PyUnicode_CheckConsistency(unicode, 1));
545 return unicode;
546}
547
548static PyObject*
549unicode_result(PyObject *unicode)
550{
551 assert(_PyUnicode_CHECK(unicode));
552 if (PyUnicode_IS_READY(unicode))
553 return unicode_result_ready(unicode);
554 else
555 return unicode_result_wchar(unicode);
556}
557
Victor Stinnerc4b49542011-12-11 22:44:26 +0100558static PyObject*
559unicode_result_unchanged(PyObject *unicode)
560{
561 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500562 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100563 return NULL;
564 Py_INCREF(unicode);
565 return unicode;
566 }
567 else
568 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100569 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100570}
571
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200572/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
573 ASCII, Latin1, UTF-8, etc. */
574static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200575backslashreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200576 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
577{
Victor Stinnerad771582015-10-09 12:38:53 +0200578 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200579 Py_UCS4 ch;
580 enum PyUnicode_Kind kind;
581 void *data;
582
583 assert(PyUnicode_IS_READY(unicode));
584 kind = PyUnicode_KIND(unicode);
585 data = PyUnicode_DATA(unicode);
586
587 size = 0;
588 /* determine replacement size */
589 for (i = collstart; i < collend; ++i) {
590 Py_ssize_t incr;
591
592 ch = PyUnicode_READ(kind, data, i);
593 if (ch < 0x100)
594 incr = 2+2;
595 else if (ch < 0x10000)
596 incr = 2+4;
597 else {
598 assert(ch <= MAX_UNICODE);
Victor Stinner3fa36ff2015-10-09 03:37:11 +0200599 incr = 2+8;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200600 }
601 if (size > PY_SSIZE_T_MAX - incr) {
602 PyErr_SetString(PyExc_OverflowError,
603 "encoded result is too long for a Python string");
604 return NULL;
605 }
606 size += incr;
607 }
608
Victor Stinnerad771582015-10-09 12:38:53 +0200609 str = _PyBytesWriter_Prepare(writer, str, size);
610 if (str == NULL)
611 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200612
613 /* generate replacement */
614 for (i = collstart; i < collend; ++i) {
615 ch = PyUnicode_READ(kind, data, i);
Victor Stinner797485e2015-10-09 03:17:30 +0200616 *str++ = '\\';
617 if (ch >= 0x00010000) {
618 *str++ = 'U';
619 *str++ = Py_hexdigits[(ch>>28)&0xf];
620 *str++ = Py_hexdigits[(ch>>24)&0xf];
621 *str++ = Py_hexdigits[(ch>>20)&0xf];
622 *str++ = Py_hexdigits[(ch>>16)&0xf];
623 *str++ = Py_hexdigits[(ch>>12)&0xf];
624 *str++ = Py_hexdigits[(ch>>8)&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200625 }
Victor Stinner797485e2015-10-09 03:17:30 +0200626 else if (ch >= 0x100) {
627 *str++ = 'u';
628 *str++ = Py_hexdigits[(ch>>12)&0xf];
629 *str++ = Py_hexdigits[(ch>>8)&0xf];
630 }
631 else
632 *str++ = 'x';
633 *str++ = Py_hexdigits[(ch>>4)&0xf];
634 *str++ = Py_hexdigits[ch&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200635 }
636 return str;
637}
638
639/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
640 ASCII, Latin1, UTF-8, etc. */
641static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200642xmlcharrefreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200643 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
644{
Victor Stinnerad771582015-10-09 12:38:53 +0200645 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200646 Py_UCS4 ch;
647 enum PyUnicode_Kind kind;
648 void *data;
649
650 assert(PyUnicode_IS_READY(unicode));
651 kind = PyUnicode_KIND(unicode);
652 data = PyUnicode_DATA(unicode);
653
654 size = 0;
655 /* determine replacement size */
656 for (i = collstart; i < collend; ++i) {
657 Py_ssize_t incr;
658
659 ch = PyUnicode_READ(kind, data, i);
660 if (ch < 10)
661 incr = 2+1+1;
662 else if (ch < 100)
663 incr = 2+2+1;
664 else if (ch < 1000)
665 incr = 2+3+1;
666 else if (ch < 10000)
667 incr = 2+4+1;
668 else if (ch < 100000)
669 incr = 2+5+1;
670 else if (ch < 1000000)
671 incr = 2+6+1;
672 else {
673 assert(ch <= MAX_UNICODE);
674 incr = 2+7+1;
675 }
676 if (size > PY_SSIZE_T_MAX - incr) {
677 PyErr_SetString(PyExc_OverflowError,
678 "encoded result is too long for a Python string");
679 return NULL;
680 }
681 size += incr;
682 }
683
Victor Stinnerad771582015-10-09 12:38:53 +0200684 str = _PyBytesWriter_Prepare(writer, str, size);
685 if (str == NULL)
686 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200687
688 /* generate replacement */
689 for (i = collstart; i < collend; ++i) {
690 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
691 }
692 return str;
693}
694
Thomas Wouters477c8d52006-05-27 19:21:47 +0000695/* --- Bloom Filters ----------------------------------------------------- */
696
/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple, a single bitmask is used: the low-order bits of
   each Unicode character (ch & (BLOOM_WIDTH - 1)) select the bit index. */
700
701/* the linebreak mask is set up by Unicode_Init below */
702
Antoine Pitrouf068f942010-01-13 14:19:12 +0000703#if LONG_BIT >= 128
704#define BLOOM_WIDTH 128
705#elif LONG_BIT >= 64
706#define BLOOM_WIDTH 64
707#elif LONG_BIT >= 32
708#define BLOOM_WIDTH 32
709#else
710#error "LONG_BIT is smaller than 32"
711#endif
712
Thomas Wouters477c8d52006-05-27 19:21:47 +0000713#define BLOOM_MASK unsigned long
714
Serhiy Storchaka05997252013-01-26 12:14:02 +0200715static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000716
Antoine Pitrouf068f942010-01-13 14:19:12 +0000717#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000718
Benjamin Peterson29060642009-01-31 22:14:21 +0000719#define BLOOM_LINEBREAK(ch) \
720 ((ch) < 128U ? ascii_linebreak[(ch)] : \
721 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000722
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700723static inline BLOOM_MASK
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200724make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000725{
Victor Stinnera85af502013-04-09 21:53:54 +0200726#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
727 do { \
728 TYPE *data = (TYPE *)PTR; \
729 TYPE *end = data + LEN; \
730 Py_UCS4 ch; \
731 for (; data != end; data++) { \
732 ch = *data; \
733 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
734 } \
735 break; \
736 } while (0)
737
Thomas Wouters477c8d52006-05-27 19:21:47 +0000738 /* calculate simple bloom-style bitmask for a given unicode string */
739
Antoine Pitrouf068f942010-01-13 14:19:12 +0000740 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000741
742 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200743 switch (kind) {
744 case PyUnicode_1BYTE_KIND:
745 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
746 break;
747 case PyUnicode_2BYTE_KIND:
748 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
749 break;
750 case PyUnicode_4BYTE_KIND:
751 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
752 break;
753 default:
754 assert(0);
755 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000756 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200757
758#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000759}
760
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300761static int
762ensure_unicode(PyObject *obj)
763{
764 if (!PyUnicode_Check(obj)) {
765 PyErr_Format(PyExc_TypeError,
766 "must be str, not %.100s",
767 Py_TYPE(obj)->tp_name);
768 return -1;
769 }
770 return PyUnicode_READY(obj);
771}
772
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200773/* Compilation of templated routines */
774
775#include "stringlib/asciilib.h"
776#include "stringlib/fastsearch.h"
777#include "stringlib/partition.h"
778#include "stringlib/split.h"
779#include "stringlib/count.h"
780#include "stringlib/find.h"
781#include "stringlib/find_max_char.h"
782#include "stringlib/localeutil.h"
783#include "stringlib/undef.h"
784
785#include "stringlib/ucs1lib.h"
786#include "stringlib/fastsearch.h"
787#include "stringlib/partition.h"
788#include "stringlib/split.h"
789#include "stringlib/count.h"
790#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300791#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200792#include "stringlib/find_max_char.h"
793#include "stringlib/localeutil.h"
794#include "stringlib/undef.h"
795
796#include "stringlib/ucs2lib.h"
797#include "stringlib/fastsearch.h"
798#include "stringlib/partition.h"
799#include "stringlib/split.h"
800#include "stringlib/count.h"
801#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300802#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200803#include "stringlib/find_max_char.h"
804#include "stringlib/localeutil.h"
805#include "stringlib/undef.h"
806
807#include "stringlib/ucs4lib.h"
808#include "stringlib/fastsearch.h"
809#include "stringlib/partition.h"
810#include "stringlib/split.h"
811#include "stringlib/count.h"
812#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300813#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200814#include "stringlib/find_max_char.h"
815#include "stringlib/localeutil.h"
816#include "stringlib/undef.h"
817
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200818#include "stringlib/unicodedefs.h"
819#include "stringlib/fastsearch.h"
820#include "stringlib/count.h"
821#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100822#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200823
Guido van Rossumd57fd912000-03-10 22:53:23 +0000824/* --- Unicode Object ----------------------------------------------------- */
825
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200826static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200827fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200828
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700829static inline Py_ssize_t
830findchar(const void *s, int kind,
831 Py_ssize_t size, Py_UCS4 ch,
832 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200833{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200834 switch (kind) {
835 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200836 if ((Py_UCS1) ch != ch)
837 return -1;
838 if (direction > 0)
839 return ucs1lib_find_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
840 else
841 return ucs1lib_rfind_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200842 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200843 if ((Py_UCS2) ch != ch)
844 return -1;
845 if (direction > 0)
846 return ucs2lib_find_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
847 else
848 return ucs2lib_rfind_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200849 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200850 if (direction > 0)
851 return ucs4lib_find_char((Py_UCS4 *) s, size, ch);
852 else
853 return ucs4lib_rfind_char((Py_UCS4 *) s, size, ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200854 default:
855 assert(0);
856 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200857 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200858}
859
Victor Stinnerafffce42012-10-03 23:03:17 +0200860#ifdef Py_DEBUG
Martin Panter6245cb32016-04-15 02:14:19 +0000861/* Fill the data of a Unicode string with invalid characters to detect bugs
Victor Stinnerafffce42012-10-03 23:03:17 +0200862 earlier.
863
864 _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
865 ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
866 invalid character in Unicode 6.0. */
867static void
868unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
869{
870 int kind = PyUnicode_KIND(unicode);
871 Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
872 Py_ssize_t length = _PyUnicode_LENGTH(unicode);
873 if (length <= old_length)
874 return;
875 memset(data + old_length * kind, 0xff, (length - old_length) * kind);
876}
877#endif
878
Victor Stinnerfe226c02011-10-03 03:52:20 +0200879static PyObject*
880resize_compact(PyObject *unicode, Py_ssize_t length)
881{
882 Py_ssize_t char_size;
883 Py_ssize_t struct_size;
884 Py_ssize_t new_size;
885 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100886 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +0200887#ifdef Py_DEBUG
888 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
889#endif
890
Victor Stinner79891572012-05-03 13:43:07 +0200891 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200892 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100893 assert(PyUnicode_IS_COMPACT(unicode));
894
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200895 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100896 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200897 struct_size = sizeof(PyASCIIObject);
898 else
899 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200900 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200901
Victor Stinnerfe226c02011-10-03 03:52:20 +0200902 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
903 PyErr_NoMemory();
904 return NULL;
905 }
906 new_size = (struct_size + (length + 1) * char_size);
907
Serhiy Storchaka7aa69082015-12-03 01:02:03 +0200908 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
909 PyObject_DEL(_PyUnicode_UTF8(unicode));
910 _PyUnicode_UTF8(unicode) = NULL;
911 _PyUnicode_UTF8_LENGTH(unicode) = 0;
912 }
Victor Stinner84def372011-12-11 20:04:56 +0100913 _Py_DEC_REFTOTAL;
914 _Py_ForgetReference(unicode);
915
Serhiy Storchaka20b39b22014-09-28 11:27:24 +0300916 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +0100917 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100918 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200919 PyErr_NoMemory();
920 return NULL;
921 }
Victor Stinner84def372011-12-11 20:04:56 +0100922 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200923 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100924
Victor Stinnerfe226c02011-10-03 03:52:20 +0200925 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200926 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200927 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100928 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200929 _PyUnicode_WSTR_LENGTH(unicode) = length;
930 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100931 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
932 PyObject_DEL(_PyUnicode_WSTR(unicode));
933 _PyUnicode_WSTR(unicode) = NULL;
Victor Stinner5bc03a62016-01-27 16:56:53 +0100934 if (!PyUnicode_IS_ASCII(unicode))
935 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100936 }
Victor Stinnerafffce42012-10-03 23:03:17 +0200937#ifdef Py_DEBUG
938 unicode_fill_invalid(unicode, old_length);
939#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200940 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
941 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200942 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200943 return unicode;
944}
945
Alexander Belopolsky40018472011-02-26 01:02:56 +0000946static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200947resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000948{
Victor Stinner95663112011-10-04 01:03:50 +0200949 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100950 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200951 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200952 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000953
Victor Stinnerfe226c02011-10-03 03:52:20 +0200954 if (PyUnicode_IS_READY(unicode)) {
955 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200956 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200957 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +0200958#ifdef Py_DEBUG
959 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
960#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200961
962 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200963 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200964 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
965 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200966
967 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
968 PyErr_NoMemory();
969 return -1;
970 }
971 new_size = (length + 1) * char_size;
972
Victor Stinner7a9105a2011-12-12 00:13:42 +0100973 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
974 {
975 PyObject_DEL(_PyUnicode_UTF8(unicode));
976 _PyUnicode_UTF8(unicode) = NULL;
977 _PyUnicode_UTF8_LENGTH(unicode) = 0;
978 }
979
Victor Stinnerfe226c02011-10-03 03:52:20 +0200980 data = (PyObject *)PyObject_REALLOC(data, new_size);
981 if (data == NULL) {
982 PyErr_NoMemory();
983 return -1;
984 }
985 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200986 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200987 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200988 _PyUnicode_WSTR_LENGTH(unicode) = length;
989 }
990 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200991 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200992 _PyUnicode_UTF8_LENGTH(unicode) = length;
993 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200994 _PyUnicode_LENGTH(unicode) = length;
995 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +0200996#ifdef Py_DEBUG
997 unicode_fill_invalid(unicode, old_length);
998#endif
Victor Stinner95663112011-10-04 01:03:50 +0200999 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001000 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001001 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001002 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001003 }
Victor Stinner95663112011-10-04 01:03:50 +02001004 assert(_PyUnicode_WSTR(unicode) != NULL);
1005
1006 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001007 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +02001008 PyErr_NoMemory();
1009 return -1;
1010 }
Victor Stinner7a9105a2011-12-12 00:13:42 +01001011 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +02001012 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +01001013 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +02001014 if (!wstr) {
1015 PyErr_NoMemory();
1016 return -1;
1017 }
1018 _PyUnicode_WSTR(unicode) = wstr;
1019 _PyUnicode_WSTR(unicode)[length] = 0;
1020 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001021 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001022 return 0;
1023}
1024
Victor Stinnerfe226c02011-10-03 03:52:20 +02001025static PyObject*
1026resize_copy(PyObject *unicode, Py_ssize_t length)
1027{
1028 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001029 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001030 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001031
Benjamin Petersonbac79492012-01-14 13:34:47 -05001032 if (PyUnicode_READY(unicode) == -1)
Victor Stinner7a9105a2011-12-12 00:13:42 +01001033 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001034
1035 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1036 if (copy == NULL)
1037 return NULL;
1038
1039 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +02001040 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001041 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +02001042 }
1043 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001044 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001045
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001046 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001047 if (w == NULL)
1048 return NULL;
1049 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1050 copy_length = Py_MIN(copy_length, length);
Christian Heimesf051e432016-09-13 20:22:02 +02001051 memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +02001052 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001053 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001054 }
1055}
1056
Guido van Rossumd57fd912000-03-10 22:53:23 +00001057/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +00001058 Ux0000 terminated; some code (e.g. new_identifier)
1059 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001060
1061 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +00001062 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001063
1064*/
1065
Alexander Belopolsky40018472011-02-26 01:02:56 +00001066static PyUnicodeObject *
1067_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001068{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001069 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001070 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001071
Thomas Wouters477c8d52006-05-27 19:21:47 +00001072 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001073 if (length == 0 && unicode_empty != NULL) {
1074 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001075 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001076 }
1077
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001078 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001079 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001080 return (PyUnicodeObject *)PyErr_NoMemory();
1081 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001082 if (length < 0) {
1083 PyErr_SetString(PyExc_SystemError,
1084 "Negative size passed to _PyUnicode_New");
1085 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001086 }
1087
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001088 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1089 if (unicode == NULL)
1090 return NULL;
1091 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +01001092
1093 _PyUnicode_WSTR_LENGTH(unicode) = length;
1094 _PyUnicode_HASH(unicode) = -1;
1095 _PyUnicode_STATE(unicode).interned = 0;
1096 _PyUnicode_STATE(unicode).kind = 0;
1097 _PyUnicode_STATE(unicode).compact = 0;
1098 _PyUnicode_STATE(unicode).ready = 0;
1099 _PyUnicode_STATE(unicode).ascii = 0;
1100 _PyUnicode_DATA_ANY(unicode) = NULL;
1101 _PyUnicode_LENGTH(unicode) = 0;
1102 _PyUnicode_UTF8(unicode) = NULL;
1103 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1104
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001105 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1106 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001107 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00001108 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001109 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00001110 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001111
Jeremy Hyltond8082792003-09-16 19:41:39 +00001112 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +00001113 * the caller fails before initializing str -- unicode_resize()
1114 * reads str[0], and the Keep-Alive optimization can keep memory
1115 * allocated for str alive across a call to unicode_dealloc(unicode).
1116 * We don't want unicode_resize to read uninitialized memory in
1117 * that case.
1118 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001119 _PyUnicode_WSTR(unicode)[0] = 0;
1120 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +01001121
Victor Stinner7931d9a2011-11-04 00:22:48 +01001122 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001123 return unicode;
1124}
1125
Victor Stinnerf42dc442011-10-02 23:33:16 +02001126static const char*
1127unicode_kind_name(PyObject *unicode)
1128{
Victor Stinner42dfd712011-10-03 14:41:45 +02001129 /* don't check consistency: unicode_kind_name() is called from
1130 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +02001131 if (!PyUnicode_IS_COMPACT(unicode))
1132 {
1133 if (!PyUnicode_IS_READY(unicode))
1134 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -06001135 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001136 {
1137 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001138 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001139 return "legacy ascii";
1140 else
1141 return "legacy latin1";
1142 case PyUnicode_2BYTE_KIND:
1143 return "legacy UCS2";
1144 case PyUnicode_4BYTE_KIND:
1145 return "legacy UCS4";
1146 default:
1147 return "<legacy invalid kind>";
1148 }
1149 }
1150 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -06001151 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001152 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001153 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001154 return "ascii";
1155 else
Victor Stinnera3b334d2011-10-03 13:53:37 +02001156 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001157 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001158 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001159 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001160 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001161 default:
1162 return "<invalid compact kind>";
1163 }
1164}
1165
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001166#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001167/* Functions wrapping macros for use in debugger */
1168char *_PyUnicode_utf8(void *unicode){
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001169 return PyUnicode_UTF8(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001170}
1171
1172void *_PyUnicode_compact_data(void *unicode) {
1173 return _PyUnicode_COMPACT_DATA(unicode);
1174}
1175void *_PyUnicode_data(void *unicode){
1176 printf("obj %p\n", unicode);
1177 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1178 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1179 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1180 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1181 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1182 return PyUnicode_DATA(unicode);
1183}
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001184
1185void
1186_PyUnicode_Dump(PyObject *op)
1187{
1188 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001189 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1190 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1191 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001192
Victor Stinnera849a4b2011-10-03 12:12:11 +02001193 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001194 {
1195 if (ascii->state.ascii)
1196 data = (ascii + 1);
1197 else
1198 data = (compact + 1);
1199 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001200 else
1201 data = unicode->data.any;
Victor Stinner293f3f52014-07-01 08:57:10 +02001202 printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1203 unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001204
Victor Stinnera849a4b2011-10-03 12:12:11 +02001205 if (ascii->wstr == data)
1206 printf("shared ");
1207 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001208
Victor Stinnera3b334d2011-10-03 13:53:37 +02001209 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinner293f3f52014-07-01 08:57:10 +02001210 printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
Victor Stinnera849a4b2011-10-03 12:12:11 +02001211 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1212 printf("shared ");
Victor Stinner293f3f52014-07-01 08:57:10 +02001213 printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
1214 compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001215 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001216 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001217}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001218#endif
1219
1220PyObject *
1221PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1222{
1223 PyObject *obj;
1224 PyCompactUnicodeObject *unicode;
1225 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001226 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001227 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001228 Py_ssize_t char_size;
1229 Py_ssize_t struct_size;
1230
1231 /* Optimization for empty strings */
1232 if (size == 0 && unicode_empty != NULL) {
1233 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001234 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001235 }
1236
Victor Stinner9e9d6892011-10-04 01:02:02 +02001237 is_ascii = 0;
1238 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001239 struct_size = sizeof(PyCompactUnicodeObject);
1240 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001241 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001242 char_size = 1;
1243 is_ascii = 1;
1244 struct_size = sizeof(PyASCIIObject);
1245 }
1246 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001247 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001248 char_size = 1;
1249 }
1250 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001251 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001252 char_size = 2;
1253 if (sizeof(wchar_t) == 2)
1254 is_sharing = 1;
1255 }
1256 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001257 if (maxchar > MAX_UNICODE) {
1258 PyErr_SetString(PyExc_SystemError,
1259 "invalid maximum character passed to PyUnicode_New");
1260 return NULL;
1261 }
Victor Stinner8f825062012-04-27 13:55:39 +02001262 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001263 char_size = 4;
1264 if (sizeof(wchar_t) == 4)
1265 is_sharing = 1;
1266 }
1267
1268 /* Ensure we won't overflow the size. */
1269 if (size < 0) {
1270 PyErr_SetString(PyExc_SystemError,
1271 "Negative size passed to PyUnicode_New");
1272 return NULL;
1273 }
1274 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1275 return PyErr_NoMemory();
1276
1277 /* Duplicated allocation code from _PyObject_New() instead of a call to
1278 * PyObject_New() so we are able to allocate space for the object and
1279 * it's data buffer.
1280 */
1281 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1282 if (obj == NULL)
1283 return PyErr_NoMemory();
1284 obj = PyObject_INIT(obj, &PyUnicode_Type);
1285 if (obj == NULL)
1286 return NULL;
1287
1288 unicode = (PyCompactUnicodeObject *)obj;
1289 if (is_ascii)
1290 data = ((PyASCIIObject*)obj) + 1;
1291 else
1292 data = unicode + 1;
1293 _PyUnicode_LENGTH(unicode) = size;
1294 _PyUnicode_HASH(unicode) = -1;
1295 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001296 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001297 _PyUnicode_STATE(unicode).compact = 1;
1298 _PyUnicode_STATE(unicode).ready = 1;
1299 _PyUnicode_STATE(unicode).ascii = is_ascii;
1300 if (is_ascii) {
1301 ((char*)data)[size] = 0;
1302 _PyUnicode_WSTR(unicode) = NULL;
1303 }
Victor Stinner8f825062012-04-27 13:55:39 +02001304 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001305 ((char*)data)[size] = 0;
1306 _PyUnicode_WSTR(unicode) = NULL;
1307 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001308 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001309 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001310 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001311 else {
1312 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001313 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001314 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001315 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001316 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001317 ((Py_UCS4*)data)[size] = 0;
1318 if (is_sharing) {
1319 _PyUnicode_WSTR_LENGTH(unicode) = size;
1320 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1321 }
1322 else {
1323 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1324 _PyUnicode_WSTR(unicode) = NULL;
1325 }
1326 }
Victor Stinner8f825062012-04-27 13:55:39 +02001327#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001328 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001329#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001330 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001331 return obj;
1332}
1333
#if SIZEOF_WCHAR_T == 2
/* Expand a 16-bit wchar_t buffer [begin, end) into the UCS4 storage of
   `unicode`.  A high surrogate immediately followed by a low surrogate is
   joined into a single code point; every other unit (including a lone
   surrogate) is copied through unchanged.  The other width conversions are
   done with macros for efficiency; only this one needs a function.

   `unicode` must already be a 4-byte-kind string whose length equals the
   number of code points produced here (checked by assertions). */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        /* There must still be room for at least one more code point. */
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if ((src + 1) < end
            && Py_UNICODE_IS_HIGH_SURROGATE(src[0])
            && Py_UNICODE_IS_LOW_SURROGATE(src[1]))
        {
            /* Well-formed pair: two input units become one output. */
            *dst = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
            src += 2;
        }
        else {
            *dst = *src;
            src += 1;
        }
        dst++;
    }
    /* The output must have been filled exactly. */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1373
Victor Stinnercd9950f2011-10-02 00:34:53 +02001374static int
Victor Stinner488fa492011-12-12 00:01:39 +01001375unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001376{
Victor Stinner488fa492011-12-12 00:01:39 +01001377 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001378 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001379 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001380 return -1;
1381 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001382 return 0;
1383}
1384
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001385static int
1386_copy_characters(PyObject *to, Py_ssize_t to_start,
1387 PyObject *from, Py_ssize_t from_start,
1388 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001389{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001390 unsigned int from_kind, to_kind;
1391 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001392
Victor Stinneree4544c2012-05-09 22:24:08 +02001393 assert(0 <= how_many);
1394 assert(0 <= from_start);
1395 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001396 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001397 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001398 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001399
Victor Stinnerd3f08822012-05-29 12:57:52 +02001400 assert(PyUnicode_Check(to));
1401 assert(PyUnicode_IS_READY(to));
1402 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1403
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001404 if (how_many == 0)
1405 return 0;
1406
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001407 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001408 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001409 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001410 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001411
Victor Stinnerf1852262012-06-16 16:38:26 +02001412#ifdef Py_DEBUG
1413 if (!check_maxchar
1414 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1415 {
1416 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1417 Py_UCS4 ch;
1418 Py_ssize_t i;
1419 for (i=0; i < how_many; i++) {
1420 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1421 assert(ch <= to_maxchar);
1422 }
1423 }
1424#endif
1425
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001426 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001427 if (check_maxchar
1428 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1429 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001430 /* Writing Latin-1 characters into an ASCII string requires to
1431 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001432 Py_UCS4 max_char;
1433 max_char = ucs1lib_find_max_char(from_data,
1434 (Py_UCS1*)from_data + how_many);
1435 if (max_char >= 128)
1436 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001437 }
Christian Heimesf051e432016-09-13 20:22:02 +02001438 memcpy((char*)to_data + to_kind * to_start,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001439 (char*)from_data + from_kind * from_start,
1440 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001441 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001442 else if (from_kind == PyUnicode_1BYTE_KIND
1443 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001444 {
1445 _PyUnicode_CONVERT_BYTES(
1446 Py_UCS1, Py_UCS2,
1447 PyUnicode_1BYTE_DATA(from) + from_start,
1448 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1449 PyUnicode_2BYTE_DATA(to) + to_start
1450 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001451 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001452 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001453 && to_kind == PyUnicode_4BYTE_KIND)
1454 {
1455 _PyUnicode_CONVERT_BYTES(
1456 Py_UCS1, Py_UCS4,
1457 PyUnicode_1BYTE_DATA(from) + from_start,
1458 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1459 PyUnicode_4BYTE_DATA(to) + to_start
1460 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001461 }
1462 else if (from_kind == PyUnicode_2BYTE_KIND
1463 && to_kind == PyUnicode_4BYTE_KIND)
1464 {
1465 _PyUnicode_CONVERT_BYTES(
1466 Py_UCS2, Py_UCS4,
1467 PyUnicode_2BYTE_DATA(from) + from_start,
1468 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1469 PyUnicode_4BYTE_DATA(to) + to_start
1470 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001471 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001472 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001473 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1474
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001475 if (!check_maxchar) {
1476 if (from_kind == PyUnicode_2BYTE_KIND
1477 && to_kind == PyUnicode_1BYTE_KIND)
1478 {
1479 _PyUnicode_CONVERT_BYTES(
1480 Py_UCS2, Py_UCS1,
1481 PyUnicode_2BYTE_DATA(from) + from_start,
1482 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1483 PyUnicode_1BYTE_DATA(to) + to_start
1484 );
1485 }
1486 else if (from_kind == PyUnicode_4BYTE_KIND
1487 && to_kind == PyUnicode_1BYTE_KIND)
1488 {
1489 _PyUnicode_CONVERT_BYTES(
1490 Py_UCS4, Py_UCS1,
1491 PyUnicode_4BYTE_DATA(from) + from_start,
1492 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1493 PyUnicode_1BYTE_DATA(to) + to_start
1494 );
1495 }
1496 else if (from_kind == PyUnicode_4BYTE_KIND
1497 && to_kind == PyUnicode_2BYTE_KIND)
1498 {
1499 _PyUnicode_CONVERT_BYTES(
1500 Py_UCS4, Py_UCS2,
1501 PyUnicode_4BYTE_DATA(from) + from_start,
1502 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1503 PyUnicode_2BYTE_DATA(to) + to_start
1504 );
1505 }
1506 else {
1507 assert(0);
1508 return -1;
1509 }
1510 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001511 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001512 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001513 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001514 Py_ssize_t i;
1515
Victor Stinnera0702ab2011-09-29 14:14:38 +02001516 for (i=0; i < how_many; i++) {
1517 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001518 if (ch > to_maxchar)
1519 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001520 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1521 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001522 }
1523 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001524 return 0;
1525}
1526
Victor Stinnerd3f08822012-05-29 12:57:52 +02001527void
1528_PyUnicode_FastCopyCharacters(
1529 PyObject *to, Py_ssize_t to_start,
1530 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001531{
1532 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1533}
1534
1535Py_ssize_t
1536PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1537 PyObject *from, Py_ssize_t from_start,
1538 Py_ssize_t how_many)
1539{
1540 int err;
1541
1542 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1543 PyErr_BadInternalCall();
1544 return -1;
1545 }
1546
Benjamin Petersonbac79492012-01-14 13:34:47 -05001547 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001548 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001549 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001550 return -1;
1551
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001552 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001553 PyErr_SetString(PyExc_IndexError, "string index out of range");
1554 return -1;
1555 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001556 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001557 PyErr_SetString(PyExc_IndexError, "string index out of range");
1558 return -1;
1559 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001560 if (how_many < 0) {
1561 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1562 return -1;
1563 }
1564 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001565 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1566 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001567 "Cannot write %zi characters at %zi "
1568 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001569 how_many, to_start, PyUnicode_GET_LENGTH(to));
1570 return -1;
1571 }
1572
1573 if (how_many == 0)
1574 return 0;
1575
Victor Stinner488fa492011-12-12 00:01:39 +01001576 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001577 return -1;
1578
1579 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1580 if (err) {
1581 PyErr_Format(PyExc_SystemError,
1582 "Cannot copy %s characters "
1583 "into a string of %s characters",
1584 unicode_kind_name(from),
1585 unicode_kind_name(to));
1586 return -1;
1587 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001588 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001589}
1590
Victor Stinner17222162011-09-28 22:15:37 +02001591/* Find the maximum code point and count the number of surrogate pairs so a
1592 correct string length can be computed before converting a string to UCS4.
1593 This function counts single surrogates as a character and not as a pair.
1594
1595 Return 0 on success, or -1 on error. */
1596static int
1597find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1598 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001599{
1600 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001601 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001602
Victor Stinnerc53be962011-10-02 21:33:54 +02001603 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001604 *num_surrogates = 0;
1605 *maxchar = 0;
1606
1607 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001608#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001609 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1610 && (iter+1) < end
1611 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1612 {
1613 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1614 ++(*num_surrogates);
1615 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001616 }
1617 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001618#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001619 {
1620 ch = *iter;
1621 iter++;
1622 }
1623 if (ch > *maxchar) {
1624 *maxchar = ch;
1625 if (*maxchar > MAX_UNICODE) {
1626 PyErr_Format(PyExc_ValueError,
1627 "character U+%x is not in range [U+0000; U+10ffff]",
1628 ch);
1629 return -1;
1630 }
1631 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001632 }
1633 return 0;
1634}
1635
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001636int
1637_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001638{
1639 wchar_t *end;
1640 Py_UCS4 maxchar = 0;
1641 Py_ssize_t num_surrogates;
1642#if SIZEOF_WCHAR_T == 2
1643 Py_ssize_t length_wo_surrogates;
1644#endif
1645
Georg Brandl7597add2011-10-05 16:36:47 +02001646 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001647 strings were created using _PyObject_New() and where no canonical
1648 representation (the str field) has been set yet aka strings
1649 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001650 assert(_PyUnicode_CHECK(unicode));
1651 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001652 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001653 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001654 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001655 /* Actually, it should neither be interned nor be anything else: */
1656 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001657
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001658 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001659 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001660 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001661 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001662
1663 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001664 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1665 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001666 PyErr_NoMemory();
1667 return -1;
1668 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001669 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001670 _PyUnicode_WSTR(unicode), end,
1671 PyUnicode_1BYTE_DATA(unicode));
1672 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1673 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1674 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1675 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001676 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001677 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001678 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001679 }
1680 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001681 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001682 _PyUnicode_UTF8(unicode) = NULL;
1683 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001684 }
1685 PyObject_FREE(_PyUnicode_WSTR(unicode));
1686 _PyUnicode_WSTR(unicode) = NULL;
1687 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1688 }
1689 /* In this case we might have to convert down from 4-byte native
1690 wchar_t to 2-byte unicode. */
1691 else if (maxchar < 65536) {
1692 assert(num_surrogates == 0 &&
1693 "FindMaxCharAndNumSurrogatePairs() messed up");
1694
Victor Stinner506f5922011-09-28 22:34:18 +02001695#if SIZEOF_WCHAR_T == 2
1696 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001697 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001698 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1699 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1700 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001701 _PyUnicode_UTF8(unicode) = NULL;
1702 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001703#else
1704 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001705 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001706 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001707 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001708 PyErr_NoMemory();
1709 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001710 }
Victor Stinner506f5922011-09-28 22:34:18 +02001711 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1712 _PyUnicode_WSTR(unicode), end,
1713 PyUnicode_2BYTE_DATA(unicode));
1714 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1715 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1716 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001717 _PyUnicode_UTF8(unicode) = NULL;
1718 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001719 PyObject_FREE(_PyUnicode_WSTR(unicode));
1720 _PyUnicode_WSTR(unicode) = NULL;
1721 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1722#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001723 }
1724 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1725 else {
1726#if SIZEOF_WCHAR_T == 2
1727 /* in case the native representation is 2-bytes, we need to allocate a
1728 new normalized 4-byte version. */
1729 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001730 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1731 PyErr_NoMemory();
1732 return -1;
1733 }
Victor Stinnerc3c74152011-10-02 20:39:55 +02001734 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1735 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001736 PyErr_NoMemory();
1737 return -1;
1738 }
1739 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1740 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001741 _PyUnicode_UTF8(unicode) = NULL;
1742 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001743 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1744 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001745 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001746 PyObject_FREE(_PyUnicode_WSTR(unicode));
1747 _PyUnicode_WSTR(unicode) = NULL;
1748 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1749#else
1750 assert(num_surrogates == 0);
1751
Victor Stinnerc3c74152011-10-02 20:39:55 +02001752 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001753 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001754 _PyUnicode_UTF8(unicode) = NULL;
1755 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001756 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1757#endif
1758 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1759 }
1760 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001761 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001762 return 0;
1763}
1764
Alexander Belopolsky40018472011-02-26 01:02:56 +00001765static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001766unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001767{
Walter Dörwald16807132007-05-25 13:52:07 +00001768 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001769 case SSTATE_NOT_INTERNED:
1770 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001771
Benjamin Peterson29060642009-01-31 22:14:21 +00001772 case SSTATE_INTERNED_MORTAL:
1773 /* revive dead object temporarily for DelItem */
1774 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001775 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001776 Py_FatalError(
1777 "deletion of interned string failed");
1778 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001779
Benjamin Peterson29060642009-01-31 22:14:21 +00001780 case SSTATE_INTERNED_IMMORTAL:
1781 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001782
Benjamin Peterson29060642009-01-31 22:14:21 +00001783 default:
1784 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001785 }
1786
Victor Stinner03490912011-10-03 23:45:12 +02001787 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001788 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001789 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001790 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001791 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1792 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001793
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001794 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001795}
1796
#ifdef Py_DEBUG
/* Debug helper: return 1 if `unicode` is the shared empty string or one of
   the cached single-character Latin-1 strings, 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;

    /* Only a one-character string with canonical data can be in the
       Latin-1 cache. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    return (ch < 256 && unicode_latin1[ch] == unicode);
}
#endif
1813
Alexander Belopolsky40018472011-02-26 01:02:56 +00001814static int
Victor Stinner488fa492011-12-12 00:01:39 +01001815unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001816{
Victor Stinner488fa492011-12-12 00:01:39 +01001817 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001818 if (Py_REFCNT(unicode) != 1)
1819 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001820 if (_PyUnicode_HASH(unicode) != -1)
1821 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001822 if (PyUnicode_CHECK_INTERNED(unicode))
1823 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001824 if (!PyUnicode_CheckExact(unicode))
1825 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001826#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001827 /* singleton refcount is greater than 1 */
1828 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001829#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001830 return 1;
1831}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001832
Victor Stinnerfe226c02011-10-03 03:52:20 +02001833static int
1834unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1835{
1836 PyObject *unicode;
1837 Py_ssize_t old_length;
1838
1839 assert(p_unicode != NULL);
1840 unicode = *p_unicode;
1841
1842 assert(unicode != NULL);
1843 assert(PyUnicode_Check(unicode));
1844 assert(0 <= length);
1845
Victor Stinner910337b2011-10-03 03:20:16 +02001846 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001847 old_length = PyUnicode_WSTR_LENGTH(unicode);
1848 else
1849 old_length = PyUnicode_GET_LENGTH(unicode);
1850 if (old_length == length)
1851 return 0;
1852
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001853 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001854 _Py_INCREF_UNICODE_EMPTY();
1855 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001856 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001857 Py_SETREF(*p_unicode, unicode_empty);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001858 return 0;
1859 }
1860
Victor Stinner488fa492011-12-12 00:01:39 +01001861 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001862 PyObject *copy = resize_copy(unicode, length);
1863 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001864 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001865 Py_SETREF(*p_unicode, copy);
Benjamin Peterson29060642009-01-31 22:14:21 +00001866 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001867 }
1868
Victor Stinnerfe226c02011-10-03 03:52:20 +02001869 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001870 PyObject *new_unicode = resize_compact(unicode, length);
1871 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001872 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001873 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001874 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001875 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001876 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001877}
1878
Alexander Belopolsky40018472011-02-26 01:02:56 +00001879int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001880PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001881{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001882 PyObject *unicode;
1883 if (p_unicode == NULL) {
1884 PyErr_BadInternalCall();
1885 return -1;
1886 }
1887 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001888 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001889 {
1890 PyErr_BadInternalCall();
1891 return -1;
1892 }
1893 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001894}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001895
Serhiy Storchakad65c9492015-11-02 14:10:23 +02001896/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001897
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001898 WARNING: The function doesn't copy the terminating null character and
1899 doesn't check the maximum character (may write a latin1 character in an
1900 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001901static void
1902unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1903 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001904{
1905 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1906 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001907 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001908
1909 switch (kind) {
1910 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001911 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02001912#ifdef Py_DEBUG
1913 if (PyUnicode_IS_ASCII(unicode)) {
1914 Py_UCS4 maxchar = ucs1lib_find_max_char(
1915 (const Py_UCS1*)str,
1916 (const Py_UCS1*)str + len);
1917 assert(maxchar < 128);
1918 }
1919#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001920 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001921 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001922 }
1923 case PyUnicode_2BYTE_KIND: {
1924 Py_UCS2 *start = (Py_UCS2 *)data + index;
1925 Py_UCS2 *ucs2 = start;
1926 assert(index <= PyUnicode_GET_LENGTH(unicode));
1927
Victor Stinner184252a2012-06-16 02:57:41 +02001928 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001929 *ucs2 = (Py_UCS2)*str;
1930
1931 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001932 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001933 }
1934 default: {
1935 Py_UCS4 *start = (Py_UCS4 *)data + index;
1936 Py_UCS4 *ucs4 = start;
1937 assert(kind == PyUnicode_4BYTE_KIND);
1938 assert(index <= PyUnicode_GET_LENGTH(unicode));
1939
Victor Stinner184252a2012-06-16 02:57:41 +02001940 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001941 *ucs4 = (Py_UCS4)*str;
1942
1943 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001944 }
1945 }
1946}
1947
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001948static PyObject*
1949get_latin1_char(unsigned char ch)
1950{
Victor Stinnera464fc12011-10-02 20:39:30 +02001951 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001952 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001953 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001954 if (!unicode)
1955 return NULL;
1956 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001957 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001958 unicode_latin1[ch] = unicode;
1959 }
1960 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001961 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001962}
1963
Victor Stinner985a82a2014-01-03 12:53:47 +01001964static PyObject*
1965unicode_char(Py_UCS4 ch)
1966{
1967 PyObject *unicode;
1968
1969 assert(ch <= MAX_UNICODE);
1970
Victor Stinnerf3b46b42014-01-03 13:16:00 +01001971 if (ch < 256)
1972 return get_latin1_char(ch);
1973
Victor Stinner985a82a2014-01-03 12:53:47 +01001974 unicode = PyUnicode_New(1, ch);
1975 if (unicode == NULL)
1976 return NULL;
1977 switch (PyUnicode_KIND(unicode)) {
1978 case PyUnicode_1BYTE_KIND:
1979 PyUnicode_1BYTE_DATA(unicode)[0] = (Py_UCS1)ch;
1980 break;
1981 case PyUnicode_2BYTE_KIND:
1982 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
1983 break;
1984 default:
1985 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1986 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
1987 }
1988 assert(_PyUnicode_CheckConsistency(unicode, 1));
1989 return unicode;
1990}
1991
Alexander Belopolsky40018472011-02-26 01:02:56 +00001992PyObject *
1993PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001994{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001995 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001996 Py_UCS4 maxchar = 0;
1997 Py_ssize_t num_surrogates;
1998
1999 if (u == NULL)
2000 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002001
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002002 /* If the Unicode data is known at construction time, we can apply
2003 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002004
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002005 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02002006 if (size == 0)
2007 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00002008
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002009 /* Single character Unicode objects in the Latin-1 range are
2010 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002011 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002012 return get_latin1_char((unsigned char)*u);
2013
2014 /* If not empty and not single character, copy the Unicode data
2015 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02002016 if (find_maxchar_surrogates(u, u + size,
2017 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002018 return NULL;
2019
Victor Stinner8faf8212011-12-08 22:14:11 +01002020 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002021 if (!unicode)
2022 return NULL;
2023
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002024 switch (PyUnicode_KIND(unicode)) {
2025 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002026 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002027 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2028 break;
2029 case PyUnicode_2BYTE_KIND:
2030#if Py_UNICODE_SIZE == 2
Christian Heimesf051e432016-09-13 20:22:02 +02002031 memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002032#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002033 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002034 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2035#endif
2036 break;
2037 case PyUnicode_4BYTE_KIND:
2038#if SIZEOF_WCHAR_T == 2
2039 /* This is the only case which has to process surrogates, thus
2040 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02002041 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002042#else
2043 assert(num_surrogates == 0);
Christian Heimesf051e432016-09-13 20:22:02 +02002044 memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002045#endif
2046 break;
2047 default:
2048 assert(0 && "Impossible state");
2049 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002050
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002051 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002052}
2053
Alexander Belopolsky40018472011-02-26 01:02:56 +00002054PyObject *
2055PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002056{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002057 if (size < 0) {
2058 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00002059 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00002060 return NULL;
2061 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002062 if (u != NULL)
2063 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2064 else
2065 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002066}
2067
Alexander Belopolsky40018472011-02-26 01:02:56 +00002068PyObject *
2069PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00002070{
2071 size_t size = strlen(u);
2072 if (size > PY_SSIZE_T_MAX) {
2073 PyErr_SetString(PyExc_OverflowError, "input too long");
2074 return NULL;
2075 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002076 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002077}
2078
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002079PyObject *
2080_PyUnicode_FromId(_Py_Identifier *id)
2081{
2082 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01002083 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
2084 strlen(id->string),
2085 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002086 if (!id->object)
2087 return NULL;
2088 PyUnicode_InternInPlace(&id->object);
2089 assert(!id->next);
2090 id->next = static_strings;
2091 static_strings = id;
2092 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002093 return id->object;
2094}
2095
2096void
2097_PyUnicode_ClearStaticStrings()
2098{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002099 _Py_Identifier *tmp, *s = static_strings;
2100 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02002101 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002102 tmp = s->next;
2103 s->next = NULL;
2104 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002105 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002106 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002107}
2108
Benjamin Peterson0df54292012-03-26 14:50:32 -04002109/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002110
Victor Stinnerd3f08822012-05-29 12:57:52 +02002111PyObject*
2112_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02002113{
Victor Stinnerd3f08822012-05-29 12:57:52 +02002114 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01002115 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01002116 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02002117#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002118 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02002119#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002120 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01002121 }
Victor Stinner785938e2011-12-11 20:09:03 +01002122 unicode = PyUnicode_New(size, 127);
2123 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02002124 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01002125 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2126 assert(_PyUnicode_CheckConsistency(unicode, 1));
2127 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02002128}
2129
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002130static Py_UCS4
2131kind_maxchar_limit(unsigned int kind)
2132{
Benjamin Petersonead6b532011-12-20 17:23:42 -06002133 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002134 case PyUnicode_1BYTE_KIND:
2135 return 0x80;
2136 case PyUnicode_2BYTE_KIND:
2137 return 0x100;
2138 case PyUnicode_4BYTE_KIND:
2139 return 0x10000;
2140 default:
2141 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01002142 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002143 }
2144}
2145
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -07002146static inline Py_UCS4
Victor Stinnere6abb482012-05-02 01:15:40 +02002147align_maxchar(Py_UCS4 maxchar)
2148{
2149 if (maxchar <= 127)
2150 return 127;
2151 else if (maxchar <= 255)
2152 return 255;
2153 else if (maxchar <= 65535)
2154 return 65535;
2155 else
2156 return MAX_UNICODE;
2157}
2158
Victor Stinner702c7342011-10-05 13:50:52 +02002159static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002160_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00002161{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002162 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002163 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002164
Serhiy Storchaka678db842013-01-26 12:16:36 +02002165 if (size == 0)
2166 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002167 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002168 if (size == 1)
2169 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002170
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002171 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002172 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002173 if (!res)
2174 return NULL;
2175 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002176 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002177 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00002178}
2179
Victor Stinnere57b1c02011-09-28 22:20:48 +02002180static PyObject*
2181_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002182{
2183 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002184 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002185
Serhiy Storchaka678db842013-01-26 12:16:36 +02002186 if (size == 0)
2187 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002188 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002189 if (size == 1)
2190 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002191
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002192 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002193 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002194 if (!res)
2195 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002196 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002197 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002198 else {
2199 _PyUnicode_CONVERT_BYTES(
2200 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2201 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002202 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002203 return res;
2204}
2205
Victor Stinnere57b1c02011-09-28 22:20:48 +02002206static PyObject*
2207_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002208{
2209 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002210 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002211
Serhiy Storchaka678db842013-01-26 12:16:36 +02002212 if (size == 0)
2213 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002214 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002215 if (size == 1)
2216 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002217
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002218 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002219 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002220 if (!res)
2221 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002222 if (max_char < 256)
2223 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2224 PyUnicode_1BYTE_DATA(res));
2225 else if (max_char < 0x10000)
2226 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2227 PyUnicode_2BYTE_DATA(res));
2228 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002229 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002230 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002231 return res;
2232}
2233
2234PyObject*
2235PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2236{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002237 if (size < 0) {
2238 PyErr_SetString(PyExc_ValueError, "size must be positive");
2239 return NULL;
2240 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002241 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002242 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002243 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002244 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002245 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002246 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002247 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002248 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002249 PyErr_SetString(PyExc_SystemError, "invalid kind");
2250 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002251 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002252}
2253
Victor Stinnerece58de2012-04-23 23:36:38 +02002254Py_UCS4
2255_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2256{
2257 enum PyUnicode_Kind kind;
2258 void *startptr, *endptr;
2259
2260 assert(PyUnicode_IS_READY(unicode));
2261 assert(0 <= start);
2262 assert(end <= PyUnicode_GET_LENGTH(unicode));
2263 assert(start <= end);
2264
2265 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2266 return PyUnicode_MAX_CHAR_VALUE(unicode);
2267
2268 if (start == end)
2269 return 127;
2270
Victor Stinner94d558b2012-04-27 22:26:58 +02002271 if (PyUnicode_IS_ASCII(unicode))
2272 return 127;
2273
Victor Stinnerece58de2012-04-23 23:36:38 +02002274 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002275 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002276 endptr = (char *)startptr + end * kind;
2277 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002278 switch(kind) {
2279 case PyUnicode_1BYTE_KIND:
2280 return ucs1lib_find_max_char(startptr, endptr);
2281 case PyUnicode_2BYTE_KIND:
2282 return ucs2lib_find_max_char(startptr, endptr);
2283 case PyUnicode_4BYTE_KIND:
2284 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002285 default:
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002286 assert(0);
2287 return 0;
Victor Stinnerece58de2012-04-23 23:36:38 +02002288 }
2289}
2290
Victor Stinner25a4b292011-10-06 12:31:55 +02002291/* Ensure that a string uses the most efficient storage, if it is not the
2292 case: create a new string with of the right kind. Write NULL into *p_unicode
2293 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002294static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002295unicode_adjust_maxchar(PyObject **p_unicode)
2296{
2297 PyObject *unicode, *copy;
2298 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002299 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002300 unsigned int kind;
2301
2302 assert(p_unicode != NULL);
2303 unicode = *p_unicode;
2304 assert(PyUnicode_IS_READY(unicode));
2305 if (PyUnicode_IS_ASCII(unicode))
2306 return;
2307
2308 len = PyUnicode_GET_LENGTH(unicode);
2309 kind = PyUnicode_KIND(unicode);
2310 if (kind == PyUnicode_1BYTE_KIND) {
2311 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002312 max_char = ucs1lib_find_max_char(u, u + len);
2313 if (max_char >= 128)
2314 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002315 }
2316 else if (kind == PyUnicode_2BYTE_KIND) {
2317 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002318 max_char = ucs2lib_find_max_char(u, u + len);
2319 if (max_char >= 256)
2320 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002321 }
2322 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002323 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002324 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002325 max_char = ucs4lib_find_max_char(u, u + len);
2326 if (max_char >= 0x10000)
2327 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002328 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002329 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002330 if (copy != NULL)
2331 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002332 Py_DECREF(unicode);
2333 *p_unicode = copy;
2334}
2335
Victor Stinner034f6cf2011-09-30 02:26:44 +02002336PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002337_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002338{
Victor Stinner87af4f22011-11-21 23:03:47 +01002339 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002340 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002341
Victor Stinner034f6cf2011-09-30 02:26:44 +02002342 if (!PyUnicode_Check(unicode)) {
2343 PyErr_BadInternalCall();
2344 return NULL;
2345 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002346 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002347 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002348
Victor Stinner87af4f22011-11-21 23:03:47 +01002349 length = PyUnicode_GET_LENGTH(unicode);
2350 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002351 if (!copy)
2352 return NULL;
2353 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2354
Christian Heimesf051e432016-09-13 20:22:02 +02002355 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
Victor Stinner87af4f22011-11-21 23:03:47 +01002356 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002357 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002358 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002359}
2360
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002361
Victor Stinnerbc603d12011-10-02 01:00:40 +02002362/* Widen Unicode objects to larger buffers. Don't write terminating null
2363 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002364
2365void*
2366_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2367{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002368 Py_ssize_t len;
2369 void *result;
2370 unsigned int skind;
2371
Benjamin Petersonbac79492012-01-14 13:34:47 -05002372 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002373 return NULL;
2374
2375 len = PyUnicode_GET_LENGTH(s);
2376 skind = PyUnicode_KIND(s);
2377 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002378 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002379 return NULL;
2380 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002381 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002382 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002383 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002384 if (!result)
2385 return PyErr_NoMemory();
2386 assert(skind == PyUnicode_1BYTE_KIND);
2387 _PyUnicode_CONVERT_BYTES(
2388 Py_UCS1, Py_UCS2,
2389 PyUnicode_1BYTE_DATA(s),
2390 PyUnicode_1BYTE_DATA(s) + len,
2391 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002392 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002393 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002394 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002395 if (!result)
2396 return PyErr_NoMemory();
2397 if (skind == PyUnicode_2BYTE_KIND) {
2398 _PyUnicode_CONVERT_BYTES(
2399 Py_UCS2, Py_UCS4,
2400 PyUnicode_2BYTE_DATA(s),
2401 PyUnicode_2BYTE_DATA(s) + len,
2402 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002403 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002404 else {
2405 assert(skind == PyUnicode_1BYTE_KIND);
2406 _PyUnicode_CONVERT_BYTES(
2407 Py_UCS1, Py_UCS4,
2408 PyUnicode_1BYTE_DATA(s),
2409 PyUnicode_1BYTE_DATA(s) + len,
2410 result);
2411 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002412 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002413 default:
2414 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002415 }
Victor Stinner01698042011-10-04 00:04:26 +02002416 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002417 return NULL;
2418}
2419
2420static Py_UCS4*
2421as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2422 int copy_null)
2423{
2424 int kind;
2425 void *data;
2426 Py_ssize_t len, targetlen;
2427 if (PyUnicode_READY(string) == -1)
2428 return NULL;
2429 kind = PyUnicode_KIND(string);
2430 data = PyUnicode_DATA(string);
2431 len = PyUnicode_GET_LENGTH(string);
2432 targetlen = len;
2433 if (copy_null)
2434 targetlen++;
2435 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002436 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002437 if (!target) {
2438 PyErr_NoMemory();
2439 return NULL;
2440 }
2441 }
2442 else {
2443 if (targetsize < targetlen) {
2444 PyErr_Format(PyExc_SystemError,
2445 "string is longer than the buffer");
2446 if (copy_null && 0 < targetsize)
2447 target[0] = 0;
2448 return NULL;
2449 }
2450 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002451 if (kind == PyUnicode_1BYTE_KIND) {
2452 Py_UCS1 *start = (Py_UCS1 *) data;
2453 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002454 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002455 else if (kind == PyUnicode_2BYTE_KIND) {
2456 Py_UCS2 *start = (Py_UCS2 *) data;
2457 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2458 }
2459 else {
2460 assert(kind == PyUnicode_4BYTE_KIND);
Christian Heimesf051e432016-09-13 20:22:02 +02002461 memcpy(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002462 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002463 if (copy_null)
2464 target[len] = 0;
2465 return target;
2466}
2467
2468Py_UCS4*
2469PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2470 int copy_null)
2471{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002472 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002473 PyErr_BadInternalCall();
2474 return NULL;
2475 }
2476 return as_ucs4(string, target, targetsize, copy_null);
2477}
2478
2479Py_UCS4*
2480PyUnicode_AsUCS4Copy(PyObject *string)
2481{
2482 return as_ucs4(string, NULL, 0, 1);
2483}
2484
2485#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00002486
Alexander Belopolsky40018472011-02-26 01:02:56 +00002487PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02002488PyUnicode_FromWideChar(const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002489{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002490 if (w == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002491 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02002492 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson29060642009-01-31 22:14:21 +00002493 PyErr_BadInternalCall();
2494 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002495 }
2496
Martin v. Löwis790465f2008-04-05 20:41:37 +00002497 if (size == -1) {
2498 size = wcslen(w);
2499 }
2500
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002501 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002502}
2503
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002504#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002505
/* Maximum number of characters required for the output of %lld or %p.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign.  53/22 is an upper bound for log10(256).

   This constant sizes the on-stack char buffers that
   unicode_fromformat_arg() fills with sprintf() for the integer and
   pointer conversions. */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002510
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002511static int
2512unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2513 Py_ssize_t width, Py_ssize_t precision)
2514{
2515 Py_ssize_t length, fill, arglen;
2516 Py_UCS4 maxchar;
2517
2518 if (PyUnicode_READY(str) == -1)
2519 return -1;
2520
2521 length = PyUnicode_GET_LENGTH(str);
2522 if ((precision == -1 || precision >= length)
2523 && width <= length)
2524 return _PyUnicodeWriter_WriteStr(writer, str);
2525
2526 if (precision != -1)
2527 length = Py_MIN(precision, length);
2528
2529 arglen = Py_MAX(length, width);
2530 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2531 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2532 else
2533 maxchar = writer->maxchar;
2534
2535 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2536 return -1;
2537
2538 if (width > length) {
2539 fill = width - length;
2540 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2541 return -1;
2542 writer->pos += fill;
2543 }
2544
2545 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2546 str, 0, length);
2547 writer->pos += length;
2548 return 0;
2549}
2550
2551static int
2552unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2553 Py_ssize_t width, Py_ssize_t precision)
2554{
2555 /* UTF-8 */
2556 Py_ssize_t length;
2557 PyObject *unicode;
2558 int res;
2559
2560 length = strlen(str);
2561 if (precision != -1)
2562 length = Py_MIN(length, precision);
2563 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2564 if (unicode == NULL)
2565 return -1;
2566
2567 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2568 Py_DECREF(unicode);
2569 return res;
2570}
2571
Victor Stinner96865452011-03-01 23:44:09 +00002572static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002573unicode_fromformat_arg(_PyUnicodeWriter *writer,
2574 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002575{
Victor Stinnere215d962012-10-06 23:03:36 +02002576 const char *p;
2577 Py_ssize_t len;
2578 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002579 Py_ssize_t width;
2580 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002581 int longflag;
2582 int longlongflag;
2583 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002584 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002585
2586 p = f;
2587 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002588 zeropad = 0;
2589 if (*f == '0') {
2590 zeropad = 1;
2591 f++;
2592 }
Victor Stinner96865452011-03-01 23:44:09 +00002593
2594 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002595 width = -1;
2596 if (Py_ISDIGIT((unsigned)*f)) {
2597 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002598 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002599 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002600 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002601 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002602 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002603 return NULL;
2604 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002605 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002606 f++;
2607 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002608 }
2609 precision = -1;
2610 if (*f == '.') {
2611 f++;
2612 if (Py_ISDIGIT((unsigned)*f)) {
2613 precision = (*f - '0');
2614 f++;
2615 while (Py_ISDIGIT((unsigned)*f)) {
2616 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2617 PyErr_SetString(PyExc_ValueError,
2618 "precision too big");
2619 return NULL;
2620 }
2621 precision = (precision * 10) + (*f - '0');
2622 f++;
2623 }
2624 }
Victor Stinner96865452011-03-01 23:44:09 +00002625 if (*f == '%') {
2626 /* "%.3%s" => f points to "3" */
2627 f--;
2628 }
2629 }
2630 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002631 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002632 f--;
2633 }
Victor Stinner96865452011-03-01 23:44:09 +00002634
2635 /* Handle %ld, %lu, %lld and %llu. */
2636 longflag = 0;
2637 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002638 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002639 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002640 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002641 longflag = 1;
2642 ++f;
2643 }
Victor Stinner96865452011-03-01 23:44:09 +00002644 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002645 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002646 longlongflag = 1;
2647 f += 2;
2648 }
Victor Stinner96865452011-03-01 23:44:09 +00002649 }
2650 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002651 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002652 size_tflag = 1;
2653 ++f;
2654 }
Victor Stinnere215d962012-10-06 23:03:36 +02002655
2656 if (f[1] == '\0')
2657 writer->overallocate = 0;
2658
2659 switch (*f) {
2660 case 'c':
2661 {
2662 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002663 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002664 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002665 "character argument not in range(0x110000)");
2666 return NULL;
2667 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002668 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002669 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002670 break;
2671 }
2672
2673 case 'i':
2674 case 'd':
2675 case 'u':
2676 case 'x':
2677 {
2678 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002679 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002680 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002681
2682 if (*f == 'u') {
Victor Stinnere215d962012-10-06 23:03:36 +02002683 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002684 len = sprintf(buffer, "%lu",
Victor Stinnere215d962012-10-06 23:03:36 +02002685 va_arg(*vargs, unsigned long));
Victor Stinnere215d962012-10-06 23:03:36 +02002686 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002687 len = sprintf(buffer, "%llu",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002688 va_arg(*vargs, unsigned long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002689 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002690 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002691 va_arg(*vargs, size_t));
2692 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002693 len = sprintf(buffer, "%u",
Victor Stinnere215d962012-10-06 23:03:36 +02002694 va_arg(*vargs, unsigned int));
2695 }
2696 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002697 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002698 }
2699 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002700 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002701 len = sprintf(buffer, "%li",
Victor Stinnere215d962012-10-06 23:03:36 +02002702 va_arg(*vargs, long));
Victor Stinnere215d962012-10-06 23:03:36 +02002703 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002704 len = sprintf(buffer, "%lli",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002705 va_arg(*vargs, long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002706 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002707 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002708 va_arg(*vargs, Py_ssize_t));
2709 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002710 len = sprintf(buffer, "%i",
Victor Stinnere215d962012-10-06 23:03:36 +02002711 va_arg(*vargs, int));
2712 }
2713 assert(len >= 0);
2714
Victor Stinnere215d962012-10-06 23:03:36 +02002715 if (precision < len)
2716 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002717
2718 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002719 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2720 return NULL;
2721
Victor Stinnere215d962012-10-06 23:03:36 +02002722 if (width > precision) {
2723 Py_UCS4 fillchar;
2724 fill = width - precision;
2725 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002726 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2727 return NULL;
2728 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002729 }
Victor Stinner15a11362012-10-06 23:48:20 +02002730 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002731 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002732 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2733 return NULL;
2734 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002735 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002736
Victor Stinner4a587072013-11-19 12:54:53 +01002737 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2738 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002739 break;
2740 }
2741
2742 case 'p':
2743 {
2744 char number[MAX_LONG_LONG_CHARS];
2745
2746 len = sprintf(number, "%p", va_arg(*vargs, void*));
2747 assert(len >= 0);
2748
2749 /* %p is ill-defined: ensure leading 0x. */
2750 if (number[1] == 'X')
2751 number[1] = 'x';
2752 else if (number[1] != 'x') {
2753 memmove(number + 2, number,
2754 strlen(number) + 1);
2755 number[0] = '0';
2756 number[1] = 'x';
2757 len += 2;
2758 }
2759
Victor Stinner4a587072013-11-19 12:54:53 +01002760 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002761 return NULL;
2762 break;
2763 }
2764
2765 case 's':
2766 {
2767 /* UTF-8 */
2768 const char *s = va_arg(*vargs, const char*);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002769 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002770 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002771 break;
2772 }
2773
2774 case 'U':
2775 {
2776 PyObject *obj = va_arg(*vargs, PyObject *);
2777 assert(obj && _PyUnicode_CHECK(obj));
2778
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002779 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002780 return NULL;
2781 break;
2782 }
2783
2784 case 'V':
2785 {
2786 PyObject *obj = va_arg(*vargs, PyObject *);
2787 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002788 if (obj) {
2789 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002790 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002791 return NULL;
2792 }
2793 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002794 assert(str != NULL);
2795 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002796 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002797 }
2798 break;
2799 }
2800
2801 case 'S':
2802 {
2803 PyObject *obj = va_arg(*vargs, PyObject *);
2804 PyObject *str;
2805 assert(obj);
2806 str = PyObject_Str(obj);
2807 if (!str)
2808 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002809 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002810 Py_DECREF(str);
2811 return NULL;
2812 }
2813 Py_DECREF(str);
2814 break;
2815 }
2816
2817 case 'R':
2818 {
2819 PyObject *obj = va_arg(*vargs, PyObject *);
2820 PyObject *repr;
2821 assert(obj);
2822 repr = PyObject_Repr(obj);
2823 if (!repr)
2824 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002825 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002826 Py_DECREF(repr);
2827 return NULL;
2828 }
2829 Py_DECREF(repr);
2830 break;
2831 }
2832
2833 case 'A':
2834 {
2835 PyObject *obj = va_arg(*vargs, PyObject *);
2836 PyObject *ascii;
2837 assert(obj);
2838 ascii = PyObject_ASCII(obj);
2839 if (!ascii)
2840 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002841 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002842 Py_DECREF(ascii);
2843 return NULL;
2844 }
2845 Py_DECREF(ascii);
2846 break;
2847 }
2848
2849 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002850 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002851 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002852 break;
2853
2854 default:
2855 /* if we stumble upon an unknown formatting code, copy the rest
2856 of the format string to the output string. (we cannot just
2857 skip the code, since there's no way to know what's in the
2858 argument list) */
2859 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01002860 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002861 return NULL;
2862 f = p+len;
2863 return f;
2864 }
2865
2866 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002867 return f;
2868}
2869
Walter Dörwaldd2034312007-05-18 16:29:38 +00002870PyObject *
2871PyUnicode_FromFormatV(const char *format, va_list vargs)
2872{
Victor Stinnere215d962012-10-06 23:03:36 +02002873 va_list vargs2;
2874 const char *f;
2875 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002876
Victor Stinner8f674cc2013-04-17 23:02:17 +02002877 _PyUnicodeWriter_Init(&writer);
2878 writer.min_length = strlen(format) + 100;
2879 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02002880
Benjamin Peterson0c212142016-09-20 20:39:33 -07002881 // Copy varags to be able to pass a reference to a subfunction.
2882 va_copy(vargs2, vargs);
Victor Stinnere215d962012-10-06 23:03:36 +02002883
2884 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002885 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002886 f = unicode_fromformat_arg(&writer, f, &vargs2);
2887 if (f == NULL)
2888 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002889 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002890 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002891 const char *p;
2892 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002893
Victor Stinnere215d962012-10-06 23:03:36 +02002894 p = f;
2895 do
2896 {
2897 if ((unsigned char)*p > 127) {
2898 PyErr_Format(PyExc_ValueError,
2899 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2900 "string, got a non-ASCII byte: 0x%02x",
2901 (unsigned char)*p);
Victor Stinner1ddf53d2016-09-21 14:13:14 +02002902 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002903 }
2904 p++;
2905 }
2906 while (*p != '\0' && *p != '%');
2907 len = p - f;
2908
2909 if (*p == '\0')
2910 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01002911
2912 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002913 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002914
2915 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002916 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002917 }
Christian Heimes2f2fee12016-09-21 11:37:27 +02002918 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02002919 return _PyUnicodeWriter_Finish(&writer);
2920
2921 fail:
Christian Heimes2f2fee12016-09-21 11:37:27 +02002922 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02002923 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002924 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002925}
2926
Walter Dörwaldd2034312007-05-18 16:29:38 +00002927PyObject *
2928PyUnicode_FromFormat(const char *format, ...)
2929{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002930 PyObject* ret;
2931 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002932
2933#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002934 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002935#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002936 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002937#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002938 ret = PyUnicode_FromFormatV(format, vargs);
2939 va_end(vargs);
2940 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002941}
2942
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002943#ifdef HAVE_WCHAR_H
2944
Victor Stinner5593d8a2010-10-02 11:11:27 +00002945/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2946 convert a Unicode object to a wide character string.
2947
Victor Stinnerd88d9832011-09-06 02:00:05 +02002948 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002949 character) required to convert the unicode object. Ignore size argument.
2950
Victor Stinnerd88d9832011-09-06 02:00:05 +02002951 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002952 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002953 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002954static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002955unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002956 wchar_t *w,
2957 Py_ssize_t size)
2958{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002959 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002960 const wchar_t *wstr;
2961
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002962 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002963 if (wstr == NULL)
2964 return -1;
2965
Victor Stinner5593d8a2010-10-02 11:11:27 +00002966 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002967 if (size > res)
2968 size = res + 1;
2969 else
2970 res = size;
Christian Heimesf051e432016-09-13 20:22:02 +02002971 memcpy(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002972 return res;
2973 }
2974 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002975 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002976}
2977
2978Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002979PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002980 wchar_t *w,
2981 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002982{
2983 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002984 PyErr_BadInternalCall();
2985 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002986 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002987 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002988}
2989
Victor Stinner137c34c2010-09-29 10:25:54 +00002990wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002991PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002992 Py_ssize_t *size)
2993{
2994 wchar_t* buffer;
2995 Py_ssize_t buflen;
2996
2997 if (unicode == NULL) {
2998 PyErr_BadInternalCall();
2999 return NULL;
3000 }
3001
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003002 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003003 if (buflen == -1)
3004 return NULL;
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02003005 buffer = PyMem_NEW(wchar_t, buflen);
Victor Stinner137c34c2010-09-29 10:25:54 +00003006 if (buffer == NULL) {
3007 PyErr_NoMemory();
3008 return NULL;
3009 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003010 buflen = unicode_aswidechar(unicode, buffer, buflen);
Stefan Krah8528c312012-08-19 21:52:43 +02003011 if (buflen == -1) {
3012 PyMem_FREE(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003013 return NULL;
Stefan Krah8528c312012-08-19 21:52:43 +02003014 }
Victor Stinner5593d8a2010-10-02 11:11:27 +00003015 if (size != NULL)
3016 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00003017 return buffer;
3018}
3019
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003020#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003021
Alexander Belopolsky40018472011-02-26 01:02:56 +00003022PyObject *
3023PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003024{
Victor Stinner8faf8212011-12-08 22:14:11 +01003025 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003026 PyErr_SetString(PyExc_ValueError,
3027 "chr() arg not in range(0x110000)");
3028 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003029 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00003030
Victor Stinner985a82a2014-01-03 12:53:47 +01003031 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003032}
3033
Alexander Belopolsky40018472011-02-26 01:02:56 +00003034PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003035PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003036{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003037 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00003038 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003039 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003040 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02003041 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00003042 Py_INCREF(obj);
3043 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003044 }
3045 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003046 /* For a Unicode subtype that's not a Unicode object,
3047 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003048 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003049 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003050 PyErr_Format(PyExc_TypeError,
3051 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00003052 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003053 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003054}
3055
Alexander Belopolsky40018472011-02-26 01:02:56 +00003056PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003057PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003058 const char *encoding,
3059 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003060{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003061 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003062 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003063
Guido van Rossumd57fd912000-03-10 22:53:23 +00003064 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003065 PyErr_BadInternalCall();
3066 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003067 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003068
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003069 /* Decoding bytes objects is the most common case and should be fast */
3070 if (PyBytes_Check(obj)) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003071 if (PyBytes_GET_SIZE(obj) == 0)
3072 _Py_RETURN_UNICODE_EMPTY();
3073 v = PyUnicode_Decode(
3074 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3075 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003076 return v;
3077 }
3078
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003079 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003080 PyErr_SetString(PyExc_TypeError,
3081 "decoding str is not supported");
3082 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003083 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003084
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003085 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3086 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3087 PyErr_Format(PyExc_TypeError,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003088 "decoding to str: need a bytes-like object, %.80s found",
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003089 Py_TYPE(obj)->tp_name);
3090 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003091 }
Tim Petersced69f82003-09-16 20:30:58 +00003092
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003093 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003094 PyBuffer_Release(&buffer);
3095 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003096 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003097
Serhiy Storchaka05997252013-01-26 12:14:02 +02003098 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003099 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003100 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003101}
3102
/* Normalize an encoding name: similar to encodings.normalize_encoding(), but
   also convert to lowercase.

   Alphanumeric characters and '.' are copied in lowercase; every run of
   other characters collapses into a single '_', except at the very start
   of the output where it is dropped (a trailing run is dropped as well).
   The result written to lower is null-terminated.

   Return 1 on success, or 0 on error (encoding is longer than
   lower_len-1). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *src;
    char *dst = lower;
    /* Last slot of the output buffer, reserved for the null character. */
    char *dst_end = &lower[lower_len - 1];
    /* Set while a run of separator characters is waiting to be emitted. */
    int pending_sep = 0;

    assert(encoding != NULL);

    for (src = encoding; *src != 0; src++) {
        char c = *src;

        if (!(Py_ISALNUM(c) || c == '.')) {
            pending_sep = 1;
            continue;
        }

        if (pending_sep && dst != lower) {
            if (dst == dst_end) {
                return 0;
            }
            *dst++ = '_';
        }
        pending_sep = 0;

        if (dst == dst_end) {
            return 0;
        }
        *dst++ = Py_TOLOWER(c);
    }

    *dst = '\0';
    return 1;
}
3151
Alexander Belopolsky40018472011-02-26 01:02:56 +00003152PyObject *
3153PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003154 Py_ssize_t size,
3155 const char *encoding,
3156 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003157{
3158 PyObject *buffer = NULL, *unicode;
3159 Py_buffer info;
Victor Stinner942889a2016-09-05 15:40:10 -07003160 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3161
3162 if (encoding == NULL) {
3163 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3164 }
Victor Stinner600d3be2010-06-10 12:00:55 +00003165
Fred Drakee4315f52000-05-09 19:53:39 +00003166 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003167 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3168 char *lower = buflower;
3169
3170 /* Fast paths */
3171 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3172 lower += 3;
3173 if (*lower == '_') {
3174 /* Match "utf8" and "utf_8" */
3175 lower++;
3176 }
3177
3178 if (lower[0] == '8' && lower[1] == 0) {
3179 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3180 }
3181 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3182 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3183 }
3184 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3185 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3186 }
3187 }
3188 else {
3189 if (strcmp(lower, "ascii") == 0
3190 || strcmp(lower, "us_ascii") == 0) {
3191 return PyUnicode_DecodeASCII(s, size, errors);
3192 }
Steve Dowercc16be82016-09-08 10:35:16 -07003193 #ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003194 else if (strcmp(lower, "mbcs") == 0) {
3195 return PyUnicode_DecodeMBCS(s, size, errors);
3196 }
3197 #endif
3198 else if (strcmp(lower, "latin1") == 0
3199 || strcmp(lower, "latin_1") == 0
3200 || strcmp(lower, "iso_8859_1") == 0
3201 || strcmp(lower, "iso8859_1") == 0) {
3202 return PyUnicode_DecodeLatin1(s, size, errors);
3203 }
3204 }
Victor Stinner37296e82010-06-10 13:36:23 +00003205 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003206
3207 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003208 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003209 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003210 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003211 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003212 if (buffer == NULL)
3213 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003214 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003215 if (unicode == NULL)
3216 goto onError;
3217 if (!PyUnicode_Check(unicode)) {
3218 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003219 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3220 "use codecs.decode() to decode to arbitrary types",
3221 encoding,
Benjamin Peterson8d761ff2016-10-16 15:41:46 -07003222 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003223 Py_DECREF(unicode);
3224 goto onError;
3225 }
3226 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003227 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003228
Benjamin Peterson29060642009-01-31 22:14:21 +00003229 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003230 Py_XDECREF(buffer);
3231 return NULL;
3232}
3233
Alexander Belopolsky40018472011-02-26 01:02:56 +00003234PyObject *
3235PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003236 const char *encoding,
3237 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003238{
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003239 if (!PyUnicode_Check(unicode)) {
3240 PyErr_BadArgument();
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003241 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003242 }
3243
Serhiy Storchaka00939072016-10-27 21:05:49 +03003244 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3245 "PyUnicode_AsDecodedObject() is deprecated; "
3246 "use PyCodec_Decode() to decode from str", 1) < 0)
3247 return NULL;
3248
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003249 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003250 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003251
3252 /* Decode via the codec registry */
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003253 return PyCodec_Decode(unicode, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003254}
3255
Alexander Belopolsky40018472011-02-26 01:02:56 +00003256PyObject *
3257PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003258 const char *encoding,
3259 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003260{
3261 PyObject *v;
3262
3263 if (!PyUnicode_Check(unicode)) {
3264 PyErr_BadArgument();
3265 goto onError;
3266 }
3267
Serhiy Storchaka00939072016-10-27 21:05:49 +03003268 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3269 "PyUnicode_AsDecodedUnicode() is deprecated; "
3270 "use PyCodec_Decode() to decode from str to str", 1) < 0)
3271 return NULL;
3272
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003273 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003274 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003275
3276 /* Decode via the codec registry */
3277 v = PyCodec_Decode(unicode, encoding, errors);
3278 if (v == NULL)
3279 goto onError;
3280 if (!PyUnicode_Check(v)) {
3281 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003282 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3283 "use codecs.decode() to decode to arbitrary types",
3284 encoding,
Benjamin Peterson8d761ff2016-10-16 15:41:46 -07003285 Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003286 Py_DECREF(v);
3287 goto onError;
3288 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003289 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003290
Benjamin Peterson29060642009-01-31 22:14:21 +00003291 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003292 return NULL;
3293}
3294
Alexander Belopolsky40018472011-02-26 01:02:56 +00003295PyObject *
3296PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003297 Py_ssize_t size,
3298 const char *encoding,
3299 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003300{
3301 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003302
Guido van Rossumd57fd912000-03-10 22:53:23 +00003303 unicode = PyUnicode_FromUnicode(s, size);
3304 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003305 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003306 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3307 Py_DECREF(unicode);
3308 return v;
3309}
3310
Alexander Belopolsky40018472011-02-26 01:02:56 +00003311PyObject *
3312PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003313 const char *encoding,
3314 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003315{
3316 PyObject *v;
3317
3318 if (!PyUnicode_Check(unicode)) {
3319 PyErr_BadArgument();
3320 goto onError;
3321 }
3322
Serhiy Storchaka00939072016-10-27 21:05:49 +03003323 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3324 "PyUnicode_AsEncodedObject() is deprecated; "
3325 "use PyUnicode_AsEncodedString() to encode from str to bytes "
3326 "or PyCodec_Encode() for generic encoding", 1) < 0)
3327 return NULL;
3328
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003329 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003330 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003331
3332 /* Encode via the codec registry */
3333 v = PyCodec_Encode(unicode, encoding, errors);
3334 if (v == NULL)
3335 goto onError;
3336 return v;
3337
Benjamin Peterson29060642009-01-31 22:14:21 +00003338 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003339 return NULL;
3340}
3341
/* Locate the first wide character of wstr that wcstombs() cannot encode.

   The string is fed to wcstombs() one character at a time (one surrogate
   pair at a time when wchar_t is 16 bits wide).  Returns the index in wstr
   of the first unit that fails to convert, or 0 when every unit converts
   successfully. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
    /* One character (or one surrogate pair) followed by a null character;
       zero-initialized so that the terminator is always in place. */
#if SIZEOF_WCHAR_T == 2
    wchar_t unit[3] = {0, 0, 0};
#else
    wchar_t unit[2] = {0, 0};
#endif
    char scratch[MB_LEN_MAX];
    const wchar_t *cursor = wstr;

    while (*cursor != L'\0') {
        const wchar_t *unit_start = cursor;

#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cursor[1]))
        {
            unit[0] = cursor[0];
            unit[1] = cursor[1];
            cursor += 2;
        }
        else {
            unit[0] = *cursor;
            unit[1] = 0;
            cursor++;
        }
#else
        unit[0] = *cursor;
        cursor++;
#endif
        if (wcstombs(scratch, unit, sizeof(scratch)) == (size_t)-1) {
            return unit_start - wstr;
        }
    }

    /* failed to find the unencodable character */
    return 0;
}
3388
Victor Stinner1b579672011-12-17 05:47:23 +01003389static int
3390locale_error_handler(const char *errors, int *surrogateescape)
3391{
Victor Stinner50149202015-09-22 00:26:54 +02003392 _Py_error_handler error_handler = get_error_handler(errors);
3393 switch (error_handler)
3394 {
3395 case _Py_ERROR_STRICT:
Victor Stinner1b579672011-12-17 05:47:23 +01003396 *surrogateescape = 0;
3397 return 0;
Victor Stinner50149202015-09-22 00:26:54 +02003398 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner1b579672011-12-17 05:47:23 +01003399 *surrogateescape = 1;
3400 return 0;
Victor Stinner50149202015-09-22 00:26:54 +02003401 default:
3402 PyErr_Format(PyExc_ValueError,
3403 "only 'strict' and 'surrogateescape' error handlers "
3404 "are supported, not '%s'",
3405 errors);
3406 return -1;
Victor Stinner1b579672011-12-17 05:47:23 +01003407 }
Victor Stinner1b579672011-12-17 05:47:23 +01003408}
3409
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003410PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003411PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003412{
3413 Py_ssize_t wlen, wlen2;
3414 wchar_t *wstr;
3415 PyObject *bytes = NULL;
3416 char *errmsg;
Raymond Hettingere56666d2013-08-04 11:51:03 -07003417 PyObject *reason = NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003418 PyObject *exc;
3419 size_t error_pos;
Victor Stinner1b579672011-12-17 05:47:23 +01003420 int surrogateescape;
3421
3422 if (locale_error_handler(errors, &surrogateescape) < 0)
3423 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003424
3425 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3426 if (wstr == NULL)
3427 return NULL;
3428
3429 wlen2 = wcslen(wstr);
3430 if (wlen2 != wlen) {
3431 PyMem_Free(wstr);
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003432 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003433 return NULL;
3434 }
3435
3436 if (surrogateescape) {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003437 /* "surrogateescape" error handler */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003438 char *str;
3439
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003440 str = Py_EncodeLocale(wstr, &error_pos);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003441 if (str == NULL) {
3442 if (error_pos == (size_t)-1) {
3443 PyErr_NoMemory();
3444 PyMem_Free(wstr);
3445 return NULL;
3446 }
3447 else {
3448 goto encode_error;
3449 }
3450 }
3451 PyMem_Free(wstr);
3452
3453 bytes = PyBytes_FromString(str);
3454 PyMem_Free(str);
3455 }
3456 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003457 /* strict mode */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003458 size_t len, len2;
3459
3460 len = wcstombs(NULL, wstr, 0);
3461 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003462 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003463 goto encode_error;
3464 }
3465
3466 bytes = PyBytes_FromStringAndSize(NULL, len);
3467 if (bytes == NULL) {
3468 PyMem_Free(wstr);
3469 return NULL;
3470 }
3471
3472 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3473 if (len2 == (size_t)-1 || len2 > len) {
Victor Stinner2f197072011-12-17 07:08:30 +01003474 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003475 goto encode_error;
3476 }
3477 PyMem_Free(wstr);
3478 }
3479 return bytes;
3480
3481encode_error:
3482 errmsg = strerror(errno);
3483 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003484
3485 if (error_pos == (size_t)-1)
3486 error_pos = wcstombs_errorpos(wstr);
3487
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003488 PyMem_Free(wstr);
3489 Py_XDECREF(bytes);
3490
Victor Stinner2f197072011-12-17 07:08:30 +01003491 if (errmsg != NULL) {
3492 size_t errlen;
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003493 wstr = Py_DecodeLocale(errmsg, &errlen);
Victor Stinner2f197072011-12-17 07:08:30 +01003494 if (wstr != NULL) {
3495 reason = PyUnicode_FromWideChar(wstr, errlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003496 PyMem_RawFree(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003497 } else
3498 errmsg = NULL;
3499 }
3500 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003501 reason = PyUnicode_FromString(
3502 "wcstombs() encountered an unencodable "
3503 "wide character");
3504 if (reason == NULL)
3505 return NULL;
3506
3507 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3508 "locale", unicode,
3509 (Py_ssize_t)error_pos,
3510 (Py_ssize_t)(error_pos+1),
3511 reason);
3512 Py_DECREF(reason);
3513 if (exc != NULL) {
3514 PyCodec_StrictErrors(exc);
3515 Py_XDECREF(exc);
3516 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003517 return NULL;
3518}
3519
Victor Stinnerad158722010-10-27 00:25:46 +00003520PyObject *
3521PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003522{
Steve Dowercc16be82016-09-08 10:35:16 -07003523#if defined(__APPLE__)
3524 return _PyUnicode_AsUTF8String(unicode, Py_FileSystemDefaultEncodeErrors);
Victor Stinnerad158722010-10-27 00:25:46 +00003525#else
Victor Stinner793b5312011-04-27 00:24:21 +02003526 PyInterpreterState *interp = PyThreadState_GET()->interp;
3527 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3528 cannot use it to encode and decode filenames before it is loaded. Load
3529 the Python codec requires to encode at least its own filename. Use the C
3530 version of the locale codec until the codec registry is initialized and
3531 the Python codec is loaded.
3532
3533 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3534 cannot only rely on it: check also interp->fscodec_initialized for
3535 subinterpreters. */
3536 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003537 return PyUnicode_AsEncodedString(unicode,
3538 Py_FileSystemDefaultEncoding,
Steve Dowercc16be82016-09-08 10:35:16 -07003539 Py_FileSystemDefaultEncodeErrors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003540 }
3541 else {
Steve Dowercc16be82016-09-08 10:35:16 -07003542 return PyUnicode_EncodeLocale(unicode, Py_FileSystemDefaultEncodeErrors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003543 }
Victor Stinnerad158722010-10-27 00:25:46 +00003544#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003545}
3546
Alexander Belopolsky40018472011-02-26 01:02:56 +00003547PyObject *
3548PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003549 const char *encoding,
3550 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003551{
3552 PyObject *v;
Victor Stinner942889a2016-09-05 15:40:10 -07003553 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003554
Guido van Rossumd57fd912000-03-10 22:53:23 +00003555 if (!PyUnicode_Check(unicode)) {
3556 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003557 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003558 }
Fred Drakee4315f52000-05-09 19:53:39 +00003559
Victor Stinner942889a2016-09-05 15:40:10 -07003560 if (encoding == NULL) {
3561 return _PyUnicode_AsUTF8String(unicode, errors);
3562 }
3563
Fred Drakee4315f52000-05-09 19:53:39 +00003564 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003565 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3566 char *lower = buflower;
3567
3568 /* Fast paths */
3569 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3570 lower += 3;
3571 if (*lower == '_') {
3572 /* Match "utf8" and "utf_8" */
3573 lower++;
3574 }
3575
3576 if (lower[0] == '8' && lower[1] == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003577 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner942889a2016-09-05 15:40:10 -07003578 }
3579 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3580 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3581 }
3582 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3583 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3584 }
Victor Stinnera5c68c32011-03-02 01:03:14 +00003585 }
Victor Stinner942889a2016-09-05 15:40:10 -07003586 else {
3587 if (strcmp(lower, "ascii") == 0
3588 || strcmp(lower, "us_ascii") == 0) {
3589 return _PyUnicode_AsASCIIString(unicode, errors);
3590 }
Steve Dowercc16be82016-09-08 10:35:16 -07003591#ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003592 else if (strcmp(lower, "mbcs") == 0) {
3593 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3594 }
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003595#endif
Victor Stinner942889a2016-09-05 15:40:10 -07003596 else if (strcmp(lower, "latin1") == 0 ||
3597 strcmp(lower, "latin_1") == 0 ||
3598 strcmp(lower, "iso_8859_1") == 0 ||
3599 strcmp(lower, "iso8859_1") == 0) {
3600 return _PyUnicode_AsLatin1String(unicode, errors);
3601 }
3602 }
Victor Stinner37296e82010-06-10 13:36:23 +00003603 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003604
3605 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003606 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003607 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003608 return NULL;
3609
3610 /* The normal path */
3611 if (PyBytes_Check(v))
3612 return v;
3613
3614 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003615 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003616 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003617 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003618
3619 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003620 "encoder %s returned bytearray instead of bytes; "
3621 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003622 encoding);
3623 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003624 Py_DECREF(v);
3625 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003626 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003627
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003628 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3629 Py_DECREF(v);
3630 return b;
3631 }
3632
3633 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003634 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
3635 "use codecs.encode() to encode to arbitrary types",
3636 encoding,
Benjamin Peterson8d761ff2016-10-16 15:41:46 -07003637 Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003638 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003639 return NULL;
3640}
3641
Alexander Belopolsky40018472011-02-26 01:02:56 +00003642PyObject *
3643PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003644 const char *encoding,
3645 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003646{
3647 PyObject *v;
3648
3649 if (!PyUnicode_Check(unicode)) {
3650 PyErr_BadArgument();
3651 goto onError;
3652 }
3653
Serhiy Storchaka00939072016-10-27 21:05:49 +03003654 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3655 "PyUnicode_AsEncodedUnicode() is deprecated; "
3656 "use PyCodec_Encode() to encode from str to str", 1) < 0)
3657 return NULL;
3658
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003659 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003660 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003661
3662 /* Encode via the codec registry */
3663 v = PyCodec_Encode(unicode, encoding, errors);
3664 if (v == NULL)
3665 goto onError;
3666 if (!PyUnicode_Check(v)) {
3667 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003668 "'%.400s' encoder returned '%.400s' instead of 'str'; "
3669 "use codecs.encode() to encode to arbitrary types",
3670 encoding,
Benjamin Peterson8d761ff2016-10-16 15:41:46 -07003671 Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003672 Py_DECREF(v);
3673 goto onError;
3674 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003675 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003676
Benjamin Peterson29060642009-01-31 22:14:21 +00003677 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003678 return NULL;
3679}
3680
/* Locate the first byte sequence in `str` (of `len` bytes) that mbrtowc()
   cannot convert.

   Returns the byte offset of that sequence, or 0 when no failing sequence is
   found or when mbrtowc() is not available (HAVE_MBRTOWC undefined). */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *cursor = str;
    size_t remaining = len;
    mbstate_t state;
    wchar_t wc;

    memset(&state, 0, sizeof state);
    while (remaining != 0) {
        size_t consumed = mbrtowc(&wc, cursor, remaining, &state);

        if (consumed == 0) {
            /* Reached end of string */
            break;
        }
        if (consumed == (size_t)-1 || consumed == (size_t)-2) {
            /* Conversion error or incomplete character */
            return (size_t)(cursor - str);
        }
        cursor += consumed;
        remaining -= consumed;
    }
    /* failed to find the undecodable byte sequence */
    return 0;
#else
    return 0;
#endif
}
3711
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003712PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003713PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003714 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003715{
3716 wchar_t smallbuf[256];
3717 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3718 wchar_t *wstr;
3719 size_t wlen, wlen2;
3720 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003721 int surrogateescape;
Victor Stinner2f197072011-12-17 07:08:30 +01003722 size_t error_pos;
3723 char *errmsg;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01003724 PyObject *reason = NULL; /* initialize to prevent gcc warning */
3725 PyObject *exc;
Victor Stinner1b579672011-12-17 05:47:23 +01003726
3727 if (locale_error_handler(errors, &surrogateescape) < 0)
3728 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003729
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003730 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3731 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003732 return NULL;
3733 }
3734
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003735 if (surrogateescape) {
3736 /* "surrogateescape" error handler */
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003737 wstr = Py_DecodeLocale(str, &wlen);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003738 if (wstr == NULL) {
3739 if (wlen == (size_t)-1)
3740 PyErr_NoMemory();
3741 else
3742 PyErr_SetFromErrno(PyExc_OSError);
3743 return NULL;
3744 }
3745
3746 unicode = PyUnicode_FromWideChar(wstr, wlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003747 PyMem_RawFree(wstr);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003748 }
3749 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003750 /* strict mode */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003751#ifndef HAVE_BROKEN_MBSTOWCS
3752 wlen = mbstowcs(NULL, str, 0);
3753#else
3754 wlen = len;
3755#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003756 if (wlen == (size_t)-1)
3757 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003758 if (wlen+1 <= smallbuf_len) {
3759 wstr = smallbuf;
3760 }
3761 else {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02003762 wstr = PyMem_New(wchar_t, wlen+1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003763 if (!wstr)
3764 return PyErr_NoMemory();
3765 }
3766
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003767 wlen2 = mbstowcs(wstr, str, wlen+1);
3768 if (wlen2 == (size_t)-1) {
3769 if (wstr != smallbuf)
3770 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003771 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003772 }
3773#ifdef HAVE_BROKEN_MBSTOWCS
3774 assert(wlen2 == wlen);
3775#endif
3776 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3777 if (wstr != smallbuf)
3778 PyMem_Free(wstr);
3779 }
3780 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003781
3782decode_error:
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +02003783 reason = NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01003784 errmsg = strerror(errno);
3785 assert(errmsg != NULL);
3786
3787 error_pos = mbstowcs_errorpos(str, len);
3788 if (errmsg != NULL) {
3789 size_t errlen;
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003790 wstr = Py_DecodeLocale(errmsg, &errlen);
Victor Stinner2f197072011-12-17 07:08:30 +01003791 if (wstr != NULL) {
3792 reason = PyUnicode_FromWideChar(wstr, errlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003793 PyMem_RawFree(wstr);
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +02003794 }
Victor Stinner2f197072011-12-17 07:08:30 +01003795 }
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +02003796 if (reason == NULL)
Victor Stinner2f197072011-12-17 07:08:30 +01003797 reason = PyUnicode_FromString(
3798 "mbstowcs() encountered an invalid multibyte sequence");
3799 if (reason == NULL)
3800 return NULL;
3801
3802 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3803 "locale", str, len,
3804 (Py_ssize_t)error_pos,
3805 (Py_ssize_t)(error_pos+1),
3806 reason);
3807 Py_DECREF(reason);
3808 if (exc != NULL) {
3809 PyCodec_StrictErrors(exc);
3810 Py_XDECREF(exc);
3811 }
3812 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003813}
3814
3815PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003816PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003817{
3818 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003819 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003820}
3821
3822
3823PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003824PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003825 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003826 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3827}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003828
Christian Heimes5894ba72007-11-04 11:43:14 +00003829PyObject*
3830PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3831{
Steve Dowercc16be82016-09-08 10:35:16 -07003832#if defined(__APPLE__)
3833 return PyUnicode_DecodeUTF8Stateful(s, size, Py_FileSystemDefaultEncodeErrors, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003834#else
Victor Stinner793b5312011-04-27 00:24:21 +02003835 PyInterpreterState *interp = PyThreadState_GET()->interp;
3836 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3837 cannot use it to encode and decode filenames before it is loaded. Load
3838 the Python codec requires to encode at least its own filename. Use the C
3839 version of the locale codec until the codec registry is initialized and
3840 the Python codec is loaded.
3841
3842 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3843 cannot only rely on it: check also interp->fscodec_initialized for
3844 subinterpreters. */
3845 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Steve Dowercc16be82016-09-08 10:35:16 -07003846 PyObject *res = PyUnicode_Decode(s, size,
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003847 Py_FileSystemDefaultEncoding,
Steve Dowercc16be82016-09-08 10:35:16 -07003848 Py_FileSystemDefaultEncodeErrors);
3849#ifdef MS_WINDOWS
3850 if (!res && PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
Serhiy Storchaka467ab192016-10-21 17:09:17 +03003851 _PyErr_FormatFromCause(PyExc_RuntimeError,
3852 "filesystem path bytes were not correctly encoded with '%s'. "
Steve Dowercc16be82016-09-08 10:35:16 -07003853 "Please report this at http://bugs.python.org/issue27781",
3854 Py_FileSystemDefaultEncoding);
Steve Dowercc16be82016-09-08 10:35:16 -07003855 }
3856#endif
3857 return res;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003858 }
3859 else {
Steve Dowercc16be82016-09-08 10:35:16 -07003860 return PyUnicode_DecodeLocaleAndSize(s, size, Py_FileSystemDefaultEncodeErrors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003861 }
Victor Stinnerad158722010-10-27 00:25:46 +00003862#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003863}
3864
Martin v. Löwis011e8422009-05-05 04:43:17 +00003865
3866int
3867PyUnicode_FSConverter(PyObject* arg, void* addr)
3868{
Brett Cannonec6ce872016-09-06 15:50:29 -07003869 PyObject *path = NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003870 PyObject *output = NULL;
3871 Py_ssize_t size;
3872 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003873 if (arg == NULL) {
3874 Py_DECREF(*(PyObject**)addr);
Benjamin Petersona4d33b32015-11-15 21:57:39 -08003875 *(PyObject**)addr = NULL;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003876 return 1;
3877 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003878 path = PyOS_FSPath(arg);
3879 if (path == NULL) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003880 return 0;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003881 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003882 if (PyBytes_Check(path)) {
3883 output = path;
3884 }
3885 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
3886 output = PyUnicode_EncodeFSDefault(path);
3887 Py_DECREF(path);
3888 if (!output) {
3889 return 0;
3890 }
3891 assert(PyBytes_Check(output));
3892 }
3893
Victor Stinner0ea2a462010-04-30 00:22:08 +00003894 size = PyBytes_GET_SIZE(output);
3895 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02003896 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003897 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003898 Py_DECREF(output);
3899 return 0;
3900 }
3901 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003902 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003903}
3904
3905
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003906int
3907PyUnicode_FSDecoder(PyObject* arg, void* addr)
3908{
Brett Cannona5711202016-09-06 19:36:01 -07003909 int is_buffer = 0;
3910 PyObject *path = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003911 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003912 if (arg == NULL) {
3913 Py_DECREF(*(PyObject**)addr);
3914 return 1;
3915 }
Brett Cannona5711202016-09-06 19:36:01 -07003916
3917 is_buffer = PyObject_CheckBuffer(arg);
3918 if (!is_buffer) {
3919 path = PyOS_FSPath(arg);
3920 if (path == NULL) {
Serhiy Storchakafebc3322016-08-06 23:29:29 +03003921 return 0;
3922 }
Brett Cannona5711202016-09-06 19:36:01 -07003923 }
3924 else {
3925 path = arg;
3926 Py_INCREF(arg);
3927 }
3928
3929 if (PyUnicode_Check(path)) {
3930 if (PyUnicode_READY(path) == -1) {
3931 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003932 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07003933 }
3934 output = path;
3935 }
3936 else if (PyBytes_Check(path) || is_buffer) {
3937 PyObject *path_bytes = NULL;
3938
3939 if (!PyBytes_Check(path) &&
3940 PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
3941 "path should be string, bytes, or os.PathLike, not %.200s",
3942 Py_TYPE(arg)->tp_name)) {
3943 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003944 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07003945 }
3946 path_bytes = PyBytes_FromObject(path);
3947 Py_DECREF(path);
3948 if (!path_bytes) {
3949 return 0;
3950 }
3951 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
3952 PyBytes_GET_SIZE(path_bytes));
3953 Py_DECREF(path_bytes);
3954 if (!output) {
3955 return 0;
3956 }
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003957 }
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003958 else {
3959 PyErr_Format(PyExc_TypeError,
Brett Cannona5711202016-09-06 19:36:01 -07003960 "path should be string, bytes, or os.PathLike, not %.200s",
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003961 Py_TYPE(arg)->tp_name);
Brett Cannona5711202016-09-06 19:36:01 -07003962 Py_DECREF(path);
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003963 return 0;
3964 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003965 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003966 Py_DECREF(output);
3967 return 0;
3968 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003969 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003970 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003971 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003972 Py_DECREF(output);
3973 return 0;
3974 }
3975 *(PyObject**)addr = output;
3976 return Py_CLEANUP_SUPPORTED;
3977}
3978
3979
Martin v. Löwis5b222132007-06-10 09:51:05 +00003980char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003981PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003982{
Christian Heimesf3863112007-11-22 07:46:41 +00003983 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003984
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003985 if (!PyUnicode_Check(unicode)) {
3986 PyErr_BadArgument();
3987 return NULL;
3988 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003989 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003990 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003991
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003992 if (PyUnicode_UTF8(unicode) == NULL) {
3993 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003994 bytes = _PyUnicode_AsUTF8String(unicode, NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003995 if (bytes == NULL)
3996 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003997 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3998 if (_PyUnicode_UTF8(unicode) == NULL) {
Victor Stinnera5afb582013-10-29 01:28:23 +01003999 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004000 Py_DECREF(bytes);
4001 return NULL;
4002 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004003 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
Christian Heimesf051e432016-09-13 20:22:02 +02004004 memcpy(_PyUnicode_UTF8(unicode),
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004005 PyBytes_AS_STRING(bytes),
4006 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004007 Py_DECREF(bytes);
4008 }
4009
4010 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004011 *psize = PyUnicode_UTF8_LENGTH(unicode);
4012 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004013}
4014
4015char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004016PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004017{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004018 return PyUnicode_AsUTF8AndSize(unicode, NULL);
4019}
4020
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004021Py_UNICODE *
4022PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
4023{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004024 const unsigned char *one_byte;
4025#if SIZEOF_WCHAR_T == 4
4026 const Py_UCS2 *two_bytes;
4027#else
4028 const Py_UCS4 *four_bytes;
4029 const Py_UCS4 *ucs4_end;
4030 Py_ssize_t num_surrogates;
4031#endif
4032 wchar_t *w;
4033 wchar_t *wchar_end;
4034
4035 if (!PyUnicode_Check(unicode)) {
4036 PyErr_BadArgument();
4037 return NULL;
4038 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004039 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004040 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004041 assert(_PyUnicode_KIND(unicode) != 0);
4042 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004043
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004044 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004045#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004046 four_bytes = PyUnicode_4BYTE_DATA(unicode);
4047 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004048 num_surrogates = 0;
4049
4050 for (; four_bytes < ucs4_end; ++four_bytes) {
4051 if (*four_bytes > 0xFFFF)
4052 ++num_surrogates;
4053 }
4054
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004055 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
4056 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
4057 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004058 PyErr_NoMemory();
4059 return NULL;
4060 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004061 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004062
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004063 w = _PyUnicode_WSTR(unicode);
4064 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
4065 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004066 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
4067 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004068 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004069 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01004070 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
4071 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004072 }
4073 else
4074 *w = *four_bytes;
4075
4076 if (w > wchar_end) {
4077 assert(0 && "Miscalculated string end");
4078 }
4079 }
4080 *w = 0;
4081#else
4082 /* sizeof(wchar_t) == 4 */
4083 Py_FatalError("Impossible unicode object state, wstr and str "
4084 "should share memory already.");
4085 return NULL;
4086#endif
4087 }
4088 else {
Serhiy Storchakae55181f2015-02-20 21:34:06 +02004089 if ((size_t)_PyUnicode_LENGTH(unicode) >
4090 PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
4091 PyErr_NoMemory();
4092 return NULL;
4093 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004094 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
4095 (_PyUnicode_LENGTH(unicode) + 1));
4096 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004097 PyErr_NoMemory();
4098 return NULL;
4099 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004100 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
4101 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
4102 w = _PyUnicode_WSTR(unicode);
4103 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004104
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004105 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
4106 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004107 for (; w < wchar_end; ++one_byte, ++w)
4108 *w = *one_byte;
4109 /* null-terminate the wstr */
4110 *w = 0;
4111 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004112 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004113#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004114 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004115 for (; w < wchar_end; ++two_bytes, ++w)
4116 *w = *two_bytes;
4117 /* null-terminate the wstr */
4118 *w = 0;
4119#else
4120 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004121 PyObject_FREE(_PyUnicode_WSTR(unicode));
4122 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004123 Py_FatalError("Impossible unicode object state, wstr "
4124 "and str should share memory already.");
4125 return NULL;
4126#endif
4127 }
4128 else {
4129 assert(0 && "This should never happen.");
4130 }
4131 }
4132 }
4133 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004134 *size = PyUnicode_WSTR_LENGTH(unicode);
4135 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00004136}
4137
Alexander Belopolsky40018472011-02-26 01:02:56 +00004138Py_UNICODE *
4139PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004140{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004141 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004142}
4143
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004144
Alexander Belopolsky40018472011-02-26 01:02:56 +00004145Py_ssize_t
4146PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004147{
4148 if (!PyUnicode_Check(unicode)) {
4149 PyErr_BadArgument();
4150 goto onError;
4151 }
4152 return PyUnicode_GET_SIZE(unicode);
4153
Benjamin Peterson29060642009-01-31 22:14:21 +00004154 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004155 return -1;
4156}
4157
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004158Py_ssize_t
4159PyUnicode_GetLength(PyObject *unicode)
4160{
Victor Stinner07621332012-06-16 04:53:46 +02004161 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004162 PyErr_BadArgument();
4163 return -1;
4164 }
Victor Stinner07621332012-06-16 04:53:46 +02004165 if (PyUnicode_READY(unicode) == -1)
4166 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004167 return PyUnicode_GET_LENGTH(unicode);
4168}
4169
4170Py_UCS4
4171PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4172{
Victor Stinner69ed0f42013-04-09 21:48:24 +02004173 void *data;
4174 int kind;
4175
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004176 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
4177 PyErr_BadArgument();
4178 return (Py_UCS4)-1;
4179 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004180 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004181 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004182 return (Py_UCS4)-1;
4183 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02004184 data = PyUnicode_DATA(unicode);
4185 kind = PyUnicode_KIND(unicode);
4186 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004187}
4188
4189int
4190PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4191{
4192 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004193 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004194 return -1;
4195 }
Victor Stinner488fa492011-12-12 00:01:39 +01004196 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004197 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004198 PyErr_SetString(PyExc_IndexError, "string index out of range");
4199 return -1;
4200 }
Victor Stinner488fa492011-12-12 00:01:39 +01004201 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004202 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004203 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4204 PyErr_SetString(PyExc_ValueError, "character out of range");
4205 return -1;
4206 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004207 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4208 index, ch);
4209 return 0;
4210}
4211
/* Return the name of the default encoding, which is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4217
Victor Stinner554f3f02010-06-16 23:33:54 +00004218/* create or adjust a UnicodeDecodeError */
4219static void
4220make_decode_exception(PyObject **exceptionObject,
4221 const char *encoding,
4222 const char *input, Py_ssize_t length,
4223 Py_ssize_t startpos, Py_ssize_t endpos,
4224 const char *reason)
4225{
4226 if (*exceptionObject == NULL) {
4227 *exceptionObject = PyUnicodeDecodeError_Create(
4228 encoding, input, length, startpos, endpos, reason);
4229 }
4230 else {
4231 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4232 goto onError;
4233 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4234 goto onError;
4235 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4236 goto onError;
4237 }
4238 return;
4239
4240onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004241 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004242}
4243
#ifdef MS_WINDOWS
/* error handling callback helper:
   build arguments, call the callback and check the arguments,
   if no exception occurred, copy the replacement to the output
   and adjust various state variables.
   return 0 on success, -1 on error

   errors/errorHandler: name of the error handler and a cache slot for the
       looked-up callable (filled in on first use).
   encoding/reason: used to build or update *exceptionObject.
   input/inend: bounds of the input buffer; rewritten to point into the
       exception's bytes object after the callback ran.
   startinpos/endinpos: extent of the offending input; *endinpos is set to
       the position returned by the handler.
   inptr: set to the input position where decoding should resume.
   output/outpos: wchar_t-kind output string (asserted) and the current
       write position; the string is resized when more room is needed.
*/

static int
unicode_decode_call_errorhandler_wchar(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    PyObject **output, Py_ssize_t *outpos)
{
    /* The text after the "O!n;" format prefix doubles as the TypeError
       message, hence &argparse[4] below. */
    static const char *argparse = "O!n;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t outsize;
    Py_ssize_t insize;
    Py_ssize_t requiredsize;
    Py_ssize_t newpos;
    PyObject *inputobj = NULL;
    wchar_t *repwstr;
    Py_ssize_t repwlen;

    assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
    outsize = _PyUnicode_WSTR_LENGTH(*output);

    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        PyErr_SetString(PyExc_TypeError, &argparse[4]);
        goto onError;
    }
    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    if (!PyBytes_Check(inputobj)) {
        PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
        /* Bail out: the PyBytes_* accessors below are only valid on a
           bytes object, and we must not report success with an
           exception set. */
        Py_DECREF(inputobj);
        goto onError;
    }
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    /* A negative position counts from the end of the input. */
    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
    if (repwstr == NULL)
        goto onError;
    /* need more space? (at least enough for what we
       have+the replacement+the rest of the string (starting
       at the new input position), so we won't have to check space
       when there are no errors in the rest of the string) */
    requiredsize = *outpos;
    if (requiredsize > PY_SSIZE_T_MAX - repwlen)
        goto overflow;
    requiredsize += repwlen;
    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
        goto overflow;
    requiredsize += insize - newpos;
    if (requiredsize > outsize) {
        /* Grow geometrically when doubling is both safe and sufficient. */
        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
            requiredsize = 2*outsize;
        if (unicode_resize(output, requiredsize) < 0)
            goto onError;
    }
    /* Copy exactly repwlen wide characters.  memcpy is used rather than
       wcsncpy because wcsncpy stops at the first embedded NUL and
       zero-fills the remainder, which would drop replacement text. */
    memcpy(_PyUnicode_WSTR(*output) + *outpos, repwstr,
           repwlen * sizeof(wchar_t));
    *outpos += repwlen;
    *endinpos = newpos;
    *inptr = *input + newpos;

    /* we made it! */
    Py_XDECREF(restuple);
    return 0;

  overflow:
    PyErr_SetString(PyExc_OverflowError,
                    "decoded result is too long for a Python string");
    /* fall through to the common error exit */

  onError:
    Py_XDECREF(restuple);
    return -1;
}
#endif   /* MS_WINDOWS */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004359
4360static int
4361unicode_decode_call_errorhandler_writer(
4362 const char *errors, PyObject **errorHandler,
4363 const char *encoding, const char *reason,
4364 const char **input, const char **inend, Py_ssize_t *startinpos,
4365 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4366 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4367{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02004368 static const char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004369
4370 PyObject *restuple = NULL;
4371 PyObject *repunicode = NULL;
4372 Py_ssize_t insize;
4373 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004374 Py_ssize_t replen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004375 PyObject *inputobj = NULL;
4376
4377 if (*errorHandler == NULL) {
4378 *errorHandler = PyCodec_LookupError(errors);
4379 if (*errorHandler == NULL)
4380 goto onError;
4381 }
4382
4383 make_decode_exception(exceptionObject,
4384 encoding,
4385 *input, *inend - *input,
4386 *startinpos, *endinpos,
4387 reason);
4388 if (*exceptionObject == NULL)
4389 goto onError;
4390
4391 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4392 if (restuple == NULL)
4393 goto onError;
4394 if (!PyTuple_Check(restuple)) {
4395 PyErr_SetString(PyExc_TypeError, &argparse[4]);
4396 goto onError;
4397 }
4398 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004399 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004400
4401 /* Copy back the bytes variables, which might have been modified by the
4402 callback */
4403 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4404 if (!inputobj)
4405 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00004406 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004407 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00004408 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004409 *input = PyBytes_AS_STRING(inputobj);
4410 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004411 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004412 /* we can DECREF safely, as the exception has another reference,
4413 so the object won't go away. */
4414 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004415
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004416 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004417 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004418 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004419 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004420 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004421 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004422
Victor Stinner8f674cc2013-04-17 23:02:17 +02004423 if (PyUnicode_READY(repunicode) < 0)
4424 goto onError;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004425 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004426 if (replen > 1) {
4427 writer->min_length += replen - 1;
Victor Stinner8f674cc2013-04-17 23:02:17 +02004428 writer->overallocate = 1;
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004429 if (_PyUnicodeWriter_Prepare(writer, writer->min_length,
4430 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4431 goto onError;
4432 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004433 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004434 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004435
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004436 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004437 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004438
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004439 /* we made it! */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004440 Py_XDECREF(restuple);
4441 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004442
Benjamin Peterson29060642009-01-31 22:14:21 +00004443 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004444 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004445 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004446}
4447
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Three simple macros defining base-64.
   NOTE: these are function-like macros that evaluate their argument more
   than once; pass only side-effect-free expressions. */

/* Is c a base-64 character? */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')
4482
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is a read-only lookup, hence const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * The range test comes first so that utf7_category is only indexed
 * with values in 1..127.  The macro evaluates c more than once. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      (directWS && (utf7_category[(c)] == 2)) ||        \
      (directO && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004528
Alexander Belopolsky40018472011-02-26 01:02:56 +00004529PyObject *
4530PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004531 Py_ssize_t size,
4532 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004533{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004534 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4535}
4536
Antoine Pitrou244651a2009-05-04 18:56:13 +00004537/* The decoder. The only state we preserve is our read position,
4538 * i.e. how many characters we have consumed. So if we end in the
4539 * middle of a shift sequence we have to back off the read position
4540 * and the output to the beginning of the sequence, otherwise we lose
4541 * all the shift state (seen bits, number of bits seen, high
4542 * surrogate). */
4543
Alexander Belopolsky40018472011-02-26 01:02:56 +00004544PyObject *
4545PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004546 Py_ssize_t size,
4547 const char *errors,
4548 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004549{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004550 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004551 Py_ssize_t startinpos;
4552 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004553 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004554 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004555 const char *errmsg = "";
4556 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004557 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004558 unsigned int base64bits = 0;
4559 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004560 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004561 PyObject *errorHandler = NULL;
4562 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004563
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004564 if (size == 0) {
4565 if (consumed)
4566 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004567 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004568 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004569
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004570 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004571 _PyUnicodeWriter_Init(&writer);
4572 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004573
4574 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004575 e = s + size;
4576
4577 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004578 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004579 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004580 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004581
Antoine Pitrou244651a2009-05-04 18:56:13 +00004582 if (inShift) { /* in a base-64 section */
4583 if (IS_BASE64(ch)) { /* consume a base-64 character */
4584 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4585 base64bits += 6;
4586 s++;
4587 if (base64bits >= 16) {
4588 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004589 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004590 base64bits -= 16;
4591 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004592 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004593 if (surrogate) {
4594 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004595 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4596 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004597 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004598 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004599 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004600 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004601 }
4602 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004603 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004604 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004605 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004606 }
4607 }
Victor Stinner551ac952011-11-29 22:58:13 +01004608 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004609 /* first surrogate */
4610 surrogate = outCh;
4611 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004612 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004613 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004614 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004615 }
4616 }
4617 }
4618 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004619 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004620 if (base64bits > 0) { /* left-over bits */
4621 if (base64bits >= 6) {
4622 /* We've seen at least one base-64 character */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004623 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004624 errmsg = "partial character in shift sequence";
4625 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004626 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004627 else {
4628 /* Some bits remain; they should be zero */
4629 if (base64buffer != 0) {
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004630 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004631 errmsg = "non-zero padding bits in shift sequence";
4632 goto utf7Error;
4633 }
4634 }
4635 }
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004636 if (surrogate && DECODE_DIRECT(ch)) {
4637 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4638 goto onError;
4639 }
4640 surrogate = 0;
4641 if (ch == '-') {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004642 /* '-' is absorbed; other terminating
4643 characters are preserved */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004644 s++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004645 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004646 }
4647 }
4648 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004649 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004650 s++; /* consume '+' */
4651 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004652 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004653 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004654 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004655 }
4656 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004657 inShift = 1;
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004658 surrogate = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004659 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004660 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004661 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004662 }
4663 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004664 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004665 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004666 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004667 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004668 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004669 else {
4670 startinpos = s-starts;
4671 s++;
4672 errmsg = "unexpected special character";
4673 goto utf7Error;
4674 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004675 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004676utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004677 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004678 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004679 errors, &errorHandler,
4680 "utf7", errmsg,
4681 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004682 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004683 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004684 }
4685
Antoine Pitrou244651a2009-05-04 18:56:13 +00004686 /* end of string */
4687
4688 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4689 /* if we're in an inconsistent state, that's an error */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004690 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004691 if (surrogate ||
4692 (base64bits >= 6) ||
4693 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004694 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004695 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004696 errors, &errorHandler,
4697 "utf7", "unterminated shift sequence",
4698 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004699 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004700 goto onError;
4701 if (s < e)
4702 goto restart;
4703 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004704 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004705
4706 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004707 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004708 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004709 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004710 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004711 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004712 writer.kind, writer.data, shiftOutStart);
4713 Py_XDECREF(errorHandler);
4714 Py_XDECREF(exc);
4715 _PyUnicodeWriter_Dealloc(&writer);
4716 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004717 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004718 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004719 }
4720 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004721 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004722 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004723 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004724
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004725 Py_XDECREF(errorHandler);
4726 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004727 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004728
Benjamin Peterson29060642009-01-31 22:14:21 +00004729 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004730 Py_XDECREF(errorHandler);
4731 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004732 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004733 return NULL;
4734}
4735
4736
Alexander Belopolsky40018472011-02-26 01:02:56 +00004737PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004738_PyUnicode_EncodeUTF7(PyObject *str,
4739 int base64SetO,
4740 int base64WhiteSpace,
4741 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004742{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004743 int kind;
4744 void *data;
4745 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004746 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004747 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004748 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004749 unsigned int base64bits = 0;
4750 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004751 char * out;
4752 char * start;
4753
Benjamin Petersonbac79492012-01-14 13:34:47 -05004754 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004755 return NULL;
4756 kind = PyUnicode_KIND(str);
4757 data = PyUnicode_DATA(str);
4758 len = PyUnicode_GET_LENGTH(str);
4759
4760 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004761 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004762
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004763 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004764 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004765 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004766 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004767 if (v == NULL)
4768 return NULL;
4769
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004770 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004771 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004772 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004773
Antoine Pitrou244651a2009-05-04 18:56:13 +00004774 if (inShift) {
4775 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4776 /* shifting out */
4777 if (base64bits) { /* output remaining bits */
4778 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4779 base64buffer = 0;
4780 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004781 }
4782 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004783 /* Characters not in the BASE64 set implicitly unshift the sequence
4784 so no '-' is required, except if the character is itself a '-' */
4785 if (IS_BASE64(ch) || ch == '-') {
4786 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004787 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004788 *out++ = (char) ch;
4789 }
4790 else {
4791 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004792 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004793 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004794 else { /* not in a shift sequence */
4795 if (ch == '+') {
4796 *out++ = '+';
4797 *out++ = '-';
4798 }
4799 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4800 *out++ = (char) ch;
4801 }
4802 else {
4803 *out++ = '+';
4804 inShift = 1;
4805 goto encode_char;
4806 }
4807 }
4808 continue;
4809encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004810 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004811 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004812
Antoine Pitrou244651a2009-05-04 18:56:13 +00004813 /* code first surrogate */
4814 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004815 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004816 while (base64bits >= 6) {
4817 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4818 base64bits -= 6;
4819 }
4820 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004821 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004822 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004823 base64bits += 16;
4824 base64buffer = (base64buffer << 16) | ch;
4825 while (base64bits >= 6) {
4826 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4827 base64bits -= 6;
4828 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004829 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004830 if (base64bits)
4831 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4832 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004833 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004834 if (_PyBytes_Resize(&v, out - start) < 0)
4835 return NULL;
4836 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004837}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004838PyObject *
4839PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4840 Py_ssize_t size,
4841 int base64SetO,
4842 int base64WhiteSpace,
4843 const char *errors)
4844{
4845 PyObject *result;
4846 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4847 if (tmp == NULL)
4848 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004849 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004850 base64WhiteSpace, errors);
4851 Py_DECREF(tmp);
4852 return result;
4853}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004854
Antoine Pitrou244651a2009-05-04 18:56:13 +00004855#undef IS_BASE64
4856#undef FROM_BASE64
4857#undef TO_BASE64
4858#undef DECODE_DIRECT
4859#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004860
Guido van Rossumd57fd912000-03-10 22:53:23 +00004861/* --- UTF-8 Codec -------------------------------------------------------- */
4862
Alexander Belopolsky40018472011-02-26 01:02:56 +00004863PyObject *
4864PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004865 Py_ssize_t size,
4866 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004867{
Walter Dörwald69652032004-09-07 20:24:22 +00004868 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4869}
4870
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004871#include "stringlib/asciilib.h"
4872#include "stringlib/codecs.h"
4873#include "stringlib/undef.h"
4874
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004875#include "stringlib/ucs1lib.h"
4876#include "stringlib/codecs.h"
4877#include "stringlib/undef.h"
4878
4879#include "stringlib/ucs2lib.h"
4880#include "stringlib/codecs.h"
4881#include "stringlib/undef.h"
4882
4883#include "stringlib/ucs4lib.h"
4884#include "stringlib/codecs.h"
4885#include "stringlib/undef.h"
4886
Antoine Pitrouab868312009-01-10 15:40:25 +00004887/* Mask to quickly check whether a C 'long' contains a
4888 non-ASCII, UTF8-encoded char. */
4889#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004890# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004891#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004892# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004893#else
4894# error C 'long' size should be either 4 or 8!
4895#endif
4896
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004897static Py_ssize_t
4898ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004899{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004900 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004901 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004902
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004903 /*
4904 * Issue #17237: m68k is a bit different from most architectures in
4905 * that objects do not use "natural alignment" - for example, int and
4906 * long are only aligned at 2-byte boundaries. Therefore the assert()
4907 * won't work; also, tests have shown that skipping the "optimised
4908 * version" will even speed up m68k.
4909 */
4910#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004911#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004912 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4913 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004914 /* Fast path, see in STRINGLIB(utf8_decode) for
4915 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004916 /* Help allocation */
4917 const char *_p = p;
4918 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004919 while (_p < aligned_end) {
4920 unsigned long value = *(const unsigned long *) _p;
4921 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004922 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004923 *((unsigned long *)q) = value;
4924 _p += SIZEOF_LONG;
4925 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004926 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004927 p = _p;
4928 while (p < end) {
4929 if ((unsigned char)*p & 0x80)
4930 break;
4931 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004932 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004933 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004934 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004935#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004936#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004937 while (p < end) {
4938 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4939 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004940 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004941 /* Help allocation */
4942 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004943 while (_p < aligned_end) {
4944 unsigned long value = *(unsigned long *) _p;
4945 if (value & ASCII_CHAR_MASK)
4946 break;
4947 _p += SIZEOF_LONG;
4948 }
4949 p = _p;
4950 if (_p == end)
4951 break;
4952 }
4953 if ((unsigned char)*p & 0x80)
4954 break;
4955 ++p;
4956 }
4957 memcpy(dest, start, p - start);
4958 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004959}
Antoine Pitrouab868312009-01-10 15:40:25 +00004960
Victor Stinner785938e2011-12-11 20:09:03 +01004961PyObject *
4962PyUnicode_DecodeUTF8Stateful(const char *s,
4963 Py_ssize_t size,
4964 const char *errors,
4965 Py_ssize_t *consumed)
4966{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004967 _PyUnicodeWriter writer;
Victor Stinner785938e2011-12-11 20:09:03 +01004968 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004969 const char *end = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004970
4971 Py_ssize_t startinpos;
4972 Py_ssize_t endinpos;
4973 const char *errmsg = "";
Victor Stinner1d65d912015-10-05 13:43:50 +02004974 PyObject *error_handler_obj = NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004975 PyObject *exc = NULL;
Victor Stinner1d65d912015-10-05 13:43:50 +02004976 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner785938e2011-12-11 20:09:03 +01004977
4978 if (size == 0) {
4979 if (consumed)
4980 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004981 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004982 }
4983
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004984 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4985 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004986 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004987 *consumed = 1;
4988 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004989 }
4990
Victor Stinner8f674cc2013-04-17 23:02:17 +02004991 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02004992 writer.min_length = size;
4993 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004994 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004995
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004996 writer.pos = ascii_decode(s, end, writer.data);
4997 s += writer.pos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004998 while (s < end) {
4999 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005000 int kind = writer.kind;
Victor Stinner1d65d912015-10-05 13:43:50 +02005001
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005002 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005003 if (PyUnicode_IS_ASCII(writer.buffer))
5004 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005005 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005006 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005007 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005008 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005009 } else {
5010 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005011 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005012 }
5013
5014 switch (ch) {
5015 case 0:
5016 if (s == end || consumed)
5017 goto End;
5018 errmsg = "unexpected end of data";
5019 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005020 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005021 break;
5022 case 1:
5023 errmsg = "invalid start byte";
5024 startinpos = s - starts;
5025 endinpos = startinpos + 1;
5026 break;
5027 case 2:
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005028 case 3:
5029 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005030 errmsg = "invalid continuation byte";
5031 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005032 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005033 break;
5034 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005035 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005036 goto onError;
5037 continue;
5038 }
5039
Victor Stinner1d65d912015-10-05 13:43:50 +02005040 if (error_handler == _Py_ERROR_UNKNOWN)
5041 error_handler = get_error_handler(errors);
5042
5043 switch (error_handler) {
5044 case _Py_ERROR_IGNORE:
5045 s += (endinpos - startinpos);
5046 break;
5047
5048 case _Py_ERROR_REPLACE:
5049 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
5050 goto onError;
5051 s += (endinpos - startinpos);
5052 break;
5053
5054 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner74e8fac2015-10-05 13:49:26 +02005055 {
5056 Py_ssize_t i;
5057
Victor Stinner1d65d912015-10-05 13:43:50 +02005058 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
5059 goto onError;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005060 for (i=startinpos; i<endinpos; i++) {
Victor Stinner1d65d912015-10-05 13:43:50 +02005061 ch = (Py_UCS4)(unsigned char)(starts[i]);
5062 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
5063 ch + 0xdc00);
5064 writer.pos++;
5065 }
5066 s += (endinpos - startinpos);
5067 break;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005068 }
Victor Stinner1d65d912015-10-05 13:43:50 +02005069
5070 default:
5071 if (unicode_decode_call_errorhandler_writer(
5072 errors, &error_handler_obj,
5073 "utf-8", errmsg,
5074 &starts, &end, &startinpos, &endinpos, &exc, &s,
5075 &writer))
5076 goto onError;
5077 }
Victor Stinner785938e2011-12-11 20:09:03 +01005078 }
5079
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005080End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005081 if (consumed)
5082 *consumed = s - starts;
5083
Victor Stinner1d65d912015-10-05 13:43:50 +02005084 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005085 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005086 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005087
5088onError:
Victor Stinner1d65d912015-10-05 13:43:50 +02005089 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005090 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005091 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005092 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01005093}
5094
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005095#ifdef __APPLE__
5096
5097/* Simplified UTF-8 decoder using surrogateescape error handler,
Victor Stinner0d92c4f2012-11-12 23:32:21 +01005098 used to decode the command line arguments on Mac OS X.
5099
5100 Return a pointer to a newly allocated wide character string (use
Victor Stinner6f8eeee2013-07-07 22:57:45 +02005101 PyMem_RawFree() to free the memory), or NULL on memory allocation error. */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005102
5103wchar_t*
5104_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
5105{
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005106 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005107 wchar_t *unicode;
5108 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005109
5110 /* Note: size will always be longer than the resulting Unicode
5111 character count */
Victor Stinnerf50e1872015-03-20 11:32:24 +01005112 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1))
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005113 return NULL;
Victor Stinner6f8eeee2013-07-07 22:57:45 +02005114 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005115 if (!unicode)
5116 return NULL;
5117
5118 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005119 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005120 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005121 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005122 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005123#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005124 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005125#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005126 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005127#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005128 if (ch > 0xFF) {
5129#if SIZEOF_WCHAR_T == 4
5130 assert(0);
5131#else
5132 assert(Py_UNICODE_IS_SURROGATE(ch));
5133 /* compute and append the two surrogates: */
5134 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5135 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5136#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005137 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005138 else {
5139 if (!ch && s == e)
5140 break;
5141 /* surrogateescape */
5142 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5143 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005144 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005145 unicode[outpos] = L'\0';
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005146 return unicode;
5147}
5148
5149#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00005150
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005151/* Primary internal function which creates utf8 encoded bytes objects.
5152
5153 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005154 and allocate exactly as much space needed at the end. Else allocate the
5155 maximum possible needed (4 result bytes per Unicode character), and return
5156 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005157*/
Tim Peters7e3d9612002-04-21 03:26:37 +00005158PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01005159_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005160{
Victor Stinner6099a032011-12-18 14:22:26 +01005161 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005162 void *data;
5163 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00005164
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005165 if (!PyUnicode_Check(unicode)) {
5166 PyErr_BadArgument();
5167 return NULL;
5168 }
5169
5170 if (PyUnicode_READY(unicode) == -1)
5171 return NULL;
5172
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005173 if (PyUnicode_UTF8(unicode))
5174 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5175 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005176
5177 kind = PyUnicode_KIND(unicode);
5178 data = PyUnicode_DATA(unicode);
5179 size = PyUnicode_GET_LENGTH(unicode);
5180
Benjamin Petersonead6b532011-12-20 17:23:42 -06005181 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005182 default:
5183 assert(0);
5184 case PyUnicode_1BYTE_KIND:
5185 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5186 assert(!PyUnicode_IS_ASCII(unicode));
5187 return ucs1lib_utf8_encoder(unicode, data, size, errors);
5188 case PyUnicode_2BYTE_KIND:
5189 return ucs2lib_utf8_encoder(unicode, data, size, errors);
5190 case PyUnicode_4BYTE_KIND:
5191 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00005192 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005193}
5194
Alexander Belopolsky40018472011-02-26 01:02:56 +00005195PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005196PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5197 Py_ssize_t size,
5198 const char *errors)
5199{
5200 PyObject *v, *unicode;
5201
5202 unicode = PyUnicode_FromUnicode(s, size);
5203 if (unicode == NULL)
5204 return NULL;
5205 v = _PyUnicode_AsUTF8String(unicode, errors);
5206 Py_DECREF(unicode);
5207 return v;
5208}
5209
5210PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005211PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005212{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005213 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005214}
5215
Walter Dörwald41980ca2007-08-16 21:55:45 +00005216/* --- UTF-32 Codec ------------------------------------------------------- */
5217
5218PyObject *
5219PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005220 Py_ssize_t size,
5221 const char *errors,
5222 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005223{
5224 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5225}
5226
5227PyObject *
5228PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005229 Py_ssize_t size,
5230 const char *errors,
5231 int *byteorder,
5232 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005233{
5234 const char *starts = s;
5235 Py_ssize_t startinpos;
5236 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005237 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005238 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01005239 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005240 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005241 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005242 PyObject *errorHandler = NULL;
5243 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005244
Walter Dörwald41980ca2007-08-16 21:55:45 +00005245 q = (unsigned char *)s;
5246 e = q + size;
5247
5248 if (byteorder)
5249 bo = *byteorder;
5250
5251 /* Check for BOM marks (U+FEFF) in the input and adjust current
5252 byte order setting accordingly. In native mode, the leading BOM
5253 mark is skipped, in all other modes, it is copied to the output
5254 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01005255 if (bo == 0 && size >= 4) {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005256 Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005257 if (bom == 0x0000FEFF) {
5258 bo = -1;
5259 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005260 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005261 else if (bom == 0xFFFE0000) {
5262 bo = 1;
5263 q += 4;
5264 }
5265 if (byteorder)
5266 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005267 }
5268
Victor Stinnere64322e2012-10-30 23:12:47 +01005269 if (q == e) {
5270 if (consumed)
5271 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005272 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00005273 }
5274
Victor Stinnere64322e2012-10-30 23:12:47 +01005275#ifdef WORDS_BIGENDIAN
5276 le = bo < 0;
5277#else
5278 le = bo <= 0;
5279#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005280 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01005281
Victor Stinner8f674cc2013-04-17 23:02:17 +02005282 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005283 writer.min_length = (e - q + 3) / 4;
5284 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005285 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005286
Victor Stinnere64322e2012-10-30 23:12:47 +01005287 while (1) {
5288 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005289 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005290
Victor Stinnere64322e2012-10-30 23:12:47 +01005291 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005292 enum PyUnicode_Kind kind = writer.kind;
5293 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005294 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005295 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005296 if (le) {
5297 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005298 ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005299 if (ch > maxch)
5300 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005301 if (kind != PyUnicode_1BYTE_KIND &&
5302 Py_UNICODE_IS_SURROGATE(ch))
5303 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005304 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005305 q += 4;
5306 } while (q <= last);
5307 }
5308 else {
5309 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005310 ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
Victor Stinnere64322e2012-10-30 23:12:47 +01005311 if (ch > maxch)
5312 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005313 if (kind != PyUnicode_1BYTE_KIND &&
5314 Py_UNICODE_IS_SURROGATE(ch))
5315 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005316 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005317 q += 4;
5318 } while (q <= last);
5319 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005320 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005321 }
5322
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005323 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005324 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005325 startinpos = ((const char *)q) - starts;
5326 endinpos = startinpos + 4;
5327 }
5328 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005329 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005330 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005331 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005332 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005333 startinpos = ((const char *)q) - starts;
5334 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005335 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005336 else {
5337 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005338 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005339 goto onError;
5340 q += 4;
5341 continue;
5342 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005343 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005344 startinpos = ((const char *)q) - starts;
5345 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005346 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005347
5348 /* The remaining input chars are ignored if the callback
5349 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005350 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005351 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005352 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005353 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005354 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005355 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005356 }
5357
Walter Dörwald41980ca2007-08-16 21:55:45 +00005358 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005359 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005360
Walter Dörwald41980ca2007-08-16 21:55:45 +00005361 Py_XDECREF(errorHandler);
5362 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005363 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005364
Benjamin Peterson29060642009-01-31 22:14:21 +00005365 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005366 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005367 Py_XDECREF(errorHandler);
5368 Py_XDECREF(exc);
5369 return NULL;
5370}
5371
5372PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005373_PyUnicode_EncodeUTF32(PyObject *str,
5374 const char *errors,
5375 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005376{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005377 enum PyUnicode_Kind kind;
5378 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005379 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005380 PyObject *v;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005381 uint32_t *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005382#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005383 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005384#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005385 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005386#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005387 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005388 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005389 PyObject *errorHandler = NULL;
5390 PyObject *exc = NULL;
5391 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005392
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005393 if (!PyUnicode_Check(str)) {
5394 PyErr_BadArgument();
5395 return NULL;
5396 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005397 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005398 return NULL;
5399 kind = PyUnicode_KIND(str);
5400 data = PyUnicode_DATA(str);
5401 len = PyUnicode_GET_LENGTH(str);
5402
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005403 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005404 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005405 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005406 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005407 if (v == NULL)
5408 return NULL;
5409
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005410 /* output buffer is 4-bytes aligned */
5411 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005412 out = (uint32_t *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005413 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005414 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005415 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005416 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005417
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005418 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005419 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005420 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005421 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005422 else
5423 encoding = "utf-32";
5424
5425 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005426 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5427 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005428 }
5429
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005430 pos = 0;
5431 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005432 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005433
5434 if (kind == PyUnicode_2BYTE_KIND) {
5435 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5436 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005437 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005438 else {
5439 assert(kind == PyUnicode_4BYTE_KIND);
5440 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5441 &out, native_ordering);
5442 }
5443 if (pos == len)
5444 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005445
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005446 rep = unicode_encode_call_errorhandler(
5447 errors, &errorHandler,
5448 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005449 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005450 if (!rep)
5451 goto error;
5452
5453 if (PyBytes_Check(rep)) {
5454 repsize = PyBytes_GET_SIZE(rep);
5455 if (repsize & 3) {
5456 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005457 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005458 "surrogates not allowed");
5459 goto error;
5460 }
5461 moreunits = repsize / 4;
5462 }
5463 else {
5464 assert(PyUnicode_Check(rep));
5465 if (PyUnicode_READY(rep) < 0)
5466 goto error;
5467 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5468 if (!PyUnicode_IS_ASCII(rep)) {
5469 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005470 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005471 "surrogates not allowed");
5472 goto error;
5473 }
5474 }
5475
5476 /* four bytes are reserved for each surrogate */
5477 if (moreunits > 1) {
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005478 Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005479 Py_ssize_t morebytes = 4 * (moreunits - 1);
5480 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5481 /* integer overflow */
5482 PyErr_NoMemory();
5483 goto error;
5484 }
5485 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5486 goto error;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005487 out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005488 }
5489
5490 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005491 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005492 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005493 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005494 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005495 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5496 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005497 }
5498
5499 Py_CLEAR(rep);
5500 }
5501
5502 /* Cut back to size actually needed. This is necessary for, for example,
5503 encoding of a string containing isolated surrogates and the 'ignore'
5504 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005505 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005506 if (nsize != PyBytes_GET_SIZE(v))
5507 _PyBytes_Resize(&v, nsize);
5508 Py_XDECREF(errorHandler);
5509 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005510 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005511 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005512 error:
5513 Py_XDECREF(rep);
5514 Py_XDECREF(errorHandler);
5515 Py_XDECREF(exc);
5516 Py_XDECREF(v);
5517 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005518}
5519
Alexander Belopolsky40018472011-02-26 01:02:56 +00005520PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005521PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5522 Py_ssize_t size,
5523 const char *errors,
5524 int byteorder)
5525{
5526 PyObject *result;
5527 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5528 if (tmp == NULL)
5529 return NULL;
5530 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5531 Py_DECREF(tmp);
5532 return result;
5533}
5534
5535PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005536PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005537{
Victor Stinnerb960b342011-11-20 19:12:52 +01005538 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005539}
5540
Guido van Rossumd57fd912000-03-10 22:53:23 +00005541/* --- UTF-16 Codec ------------------------------------------------------- */
5542
Tim Peters772747b2001-08-09 22:21:55 +00005543PyObject *
5544PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005545 Py_ssize_t size,
5546 const char *errors,
5547 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005548{
Walter Dörwald69652032004-09-07 20:24:22 +00005549 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5550}
5551
5552PyObject *
5553PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005554 Py_ssize_t size,
5555 const char *errors,
5556 int *byteorder,
5557 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005558{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005559 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005560 Py_ssize_t startinpos;
5561 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005562 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005563 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005564 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005565 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005566 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005567 PyObject *errorHandler = NULL;
5568 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005569 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005570
Tim Peters772747b2001-08-09 22:21:55 +00005571 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005572 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005573
5574 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005575 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005576
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005577 /* Check for BOM marks (U+FEFF) in the input and adjust current
5578 byte order setting accordingly. In native mode, the leading BOM
5579 mark is skipped, in all other modes, it is copied to the output
5580 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005581 if (bo == 0 && size >= 2) {
5582 const Py_UCS4 bom = (q[1] << 8) | q[0];
5583 if (bom == 0xFEFF) {
5584 q += 2;
5585 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005586 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005587 else if (bom == 0xFFFE) {
5588 q += 2;
5589 bo = 1;
5590 }
5591 if (byteorder)
5592 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005593 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005594
Antoine Pitrou63065d72012-05-15 23:48:04 +02005595 if (q == e) {
5596 if (consumed)
5597 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005598 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005599 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005600
Christian Heimes743e0cd2012-10-17 23:52:17 +02005601#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005602 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005603 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005604#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005605 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005606 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005607#endif
Tim Peters772747b2001-08-09 22:21:55 +00005608
Antoine Pitrou63065d72012-05-15 23:48:04 +02005609 /* Note: size will always be longer than the resulting Unicode
5610 character count */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005611 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005612 writer.min_length = (e - q + 1) / 2;
5613 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005614 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005615
Antoine Pitrou63065d72012-05-15 23:48:04 +02005616 while (1) {
5617 Py_UCS4 ch = 0;
5618 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005619 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005620 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005621 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005622 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005623 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005624 native_ordering);
5625 else
5626 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005627 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005628 native_ordering);
5629 } else if (kind == PyUnicode_2BYTE_KIND) {
5630 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005631 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005632 native_ordering);
5633 } else {
5634 assert(kind == PyUnicode_4BYTE_KIND);
5635 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005636 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005637 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005638 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005639 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005640
Antoine Pitrou63065d72012-05-15 23:48:04 +02005641 switch (ch)
5642 {
5643 case 0:
5644 /* remaining byte at the end? (size should be even) */
5645 if (q == e || consumed)
5646 goto End;
5647 errmsg = "truncated data";
5648 startinpos = ((const char *)q) - starts;
5649 endinpos = ((const char *)e) - starts;
5650 break;
5651 /* The remaining input chars are ignored if the callback
5652 chooses to skip the input */
5653 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005654 q -= 2;
5655 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005656 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005657 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005658 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005659 endinpos = ((const char *)e) - starts;
5660 break;
5661 case 2:
5662 errmsg = "illegal encoding";
5663 startinpos = ((const char *)q) - 2 - starts;
5664 endinpos = startinpos + 2;
5665 break;
5666 case 3:
5667 errmsg = "illegal UTF-16 surrogate";
5668 startinpos = ((const char *)q) - 4 - starts;
5669 endinpos = startinpos + 2;
5670 break;
5671 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005672 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005673 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005674 continue;
5675 }
5676
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005677 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005678 errors,
5679 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005680 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00005681 &starts,
5682 (const char **)&e,
5683 &startinpos,
5684 &endinpos,
5685 &exc,
5686 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005687 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005688 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005689 }
5690
Antoine Pitrou63065d72012-05-15 23:48:04 +02005691End:
Walter Dörwald69652032004-09-07 20:24:22 +00005692 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005693 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005694
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005695 Py_XDECREF(errorHandler);
5696 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005697 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005698
Benjamin Peterson29060642009-01-31 22:14:21 +00005699 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005700 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005701 Py_XDECREF(errorHandler);
5702 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005703 return NULL;
5704}
5705
Tim Peters772747b2001-08-09 22:21:55 +00005706PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005707_PyUnicode_EncodeUTF16(PyObject *str,
5708 const char *errors,
5709 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005710{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005711 enum PyUnicode_Kind kind;
5712 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005713 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005714 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005715 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005716 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005717#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005718 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005719#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005720 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005721#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005722 const char *encoding;
5723 Py_ssize_t nsize, pos;
5724 PyObject *errorHandler = NULL;
5725 PyObject *exc = NULL;
5726 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00005727
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005728 if (!PyUnicode_Check(str)) {
5729 PyErr_BadArgument();
5730 return NULL;
5731 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005732 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005733 return NULL;
5734 kind = PyUnicode_KIND(str);
5735 data = PyUnicode_DATA(str);
5736 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005737
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005738 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005739 if (kind == PyUnicode_4BYTE_KIND) {
5740 const Py_UCS4 *in = (const Py_UCS4 *)data;
5741 const Py_UCS4 *end = in + len;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005742 while (in < end) {
5743 if (*in++ >= 0x10000) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005744 pairs++;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005745 }
5746 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005747 }
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005748 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005749 return PyErr_NoMemory();
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005750 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005751 nsize = len + pairs + (byteorder == 0);
5752 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005753 if (v == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005754 return NULL;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005755 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005756
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005757 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005758 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005759 out = (unsigned short *)PyBytes_AS_STRING(v);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005760 if (byteorder == 0) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005761 *out++ = 0xFEFF;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005762 }
5763 if (len == 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00005764 goto done;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005765 }
Tim Peters772747b2001-08-09 22:21:55 +00005766
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005767 if (kind == PyUnicode_1BYTE_KIND) {
5768 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5769 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005770 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005771
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005772 if (byteorder < 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005773 encoding = "utf-16-le";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005774 }
5775 else if (byteorder > 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005776 encoding = "utf-16-be";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005777 }
5778 else {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005779 encoding = "utf-16";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005780 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005781
5782 pos = 0;
5783 while (pos < len) {
5784 Py_ssize_t repsize, moreunits;
5785
5786 if (kind == PyUnicode_2BYTE_KIND) {
5787 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
5788 &out, native_ordering);
5789 }
5790 else {
5791 assert(kind == PyUnicode_4BYTE_KIND);
5792 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
5793 &out, native_ordering);
5794 }
5795 if (pos == len)
5796 break;
5797
5798 rep = unicode_encode_call_errorhandler(
5799 errors, &errorHandler,
5800 encoding, "surrogates not allowed",
5801 str, &exc, pos, pos + 1, &pos);
5802 if (!rep)
5803 goto error;
5804
5805 if (PyBytes_Check(rep)) {
5806 repsize = PyBytes_GET_SIZE(rep);
5807 if (repsize & 1) {
5808 raise_encode_exception(&exc, encoding,
5809 str, pos - 1, pos,
5810 "surrogates not allowed");
5811 goto error;
5812 }
5813 moreunits = repsize / 2;
5814 }
5815 else {
5816 assert(PyUnicode_Check(rep));
5817 if (PyUnicode_READY(rep) < 0)
5818 goto error;
5819 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5820 if (!PyUnicode_IS_ASCII(rep)) {
5821 raise_encode_exception(&exc, encoding,
5822 str, pos - 1, pos,
5823 "surrogates not allowed");
5824 goto error;
5825 }
5826 }
5827
5828 /* two bytes are reserved for each surrogate */
5829 if (moreunits > 1) {
5830 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
5831 Py_ssize_t morebytes = 2 * (moreunits - 1);
5832 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5833 /* integer overflow */
5834 PyErr_NoMemory();
5835 goto error;
5836 }
5837 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5838 goto error;
5839 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
5840 }
5841
5842 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005843 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005844 out += moreunits;
5845 } else /* rep is unicode */ {
5846 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5847 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5848 &out, native_ordering);
5849 }
5850
5851 Py_CLEAR(rep);
5852 }
5853
5854 /* Cut back to size actually needed. This is necessary for, for example,
5855 encoding of a string containing isolated surrogates and the 'ignore' handler
5856 is used. */
5857 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5858 if (nsize != PyBytes_GET_SIZE(v))
5859 _PyBytes_Resize(&v, nsize);
5860 Py_XDECREF(errorHandler);
5861 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00005862 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005863 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005864 error:
5865 Py_XDECREF(rep);
5866 Py_XDECREF(errorHandler);
5867 Py_XDECREF(exc);
5868 Py_XDECREF(v);
5869 return NULL;
5870#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005871}
5872
Alexander Belopolsky40018472011-02-26 01:02:56 +00005873PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005874PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5875 Py_ssize_t size,
5876 const char *errors,
5877 int byteorder)
5878{
5879 PyObject *result;
5880 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5881 if (tmp == NULL)
5882 return NULL;
5883 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5884 Py_DECREF(tmp);
5885 return result;
5886}
5887
5888PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005889PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005890{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005891 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005892}
5893
5894/* --- Unicode Escape Codec ----------------------------------------------- */
5895
Fredrik Lundh06d12682001-01-24 07:59:11 +00005896static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005897
Alexander Belopolsky40018472011-02-26 01:02:56 +00005898PyObject *
Eric V. Smith56466482016-10-31 14:46:26 -04005899_PyUnicode_DecodeUnicodeEscape(const char *s,
5900 Py_ssize_t size,
5901 const char *errors,
5902 const char **first_invalid_escape)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005903{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005904 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005905 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005906 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005907 PyObject *errorHandler = NULL;
5908 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005909
Eric V. Smith56466482016-10-31 14:46:26 -04005910 // so we can remember if we've seen an invalid escape char or not
5911 *first_invalid_escape = NULL;
5912
Victor Stinner62ec3312016-09-06 17:04:34 -07005913 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005914 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07005915 }
5916 /* Escaped strings will always be longer than the resulting
5917 Unicode string, so we start with size here and then reduce the
5918 length after conversion to the true value.
5919 (but if the error callback returns a long replacement string
5920 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005921 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07005922 writer.min_length = size;
5923 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
5924 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005925 }
5926
Guido van Rossumd57fd912000-03-10 22:53:23 +00005927 end = s + size;
5928 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07005929 unsigned char c = (unsigned char) *s++;
5930 Py_UCS4 ch;
5931 int count;
5932 Py_ssize_t startinpos;
5933 Py_ssize_t endinpos;
5934 const char *message;
5935
5936#define WRITE_ASCII_CHAR(ch) \
5937 do { \
5938 assert(ch <= 127); \
5939 assert(writer.pos < writer.size); \
5940 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
5941 } while(0)
5942
5943#define WRITE_CHAR(ch) \
5944 do { \
5945 if (ch <= writer.maxchar) { \
5946 assert(writer.pos < writer.size); \
5947 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
5948 } \
5949 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
5950 goto onError; \
5951 } \
5952 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005953
5954 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07005955 if (c != '\\') {
5956 WRITE_CHAR(c);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005957 continue;
5958 }
5959
Victor Stinner62ec3312016-09-06 17:04:34 -07005960 startinpos = s - starts - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005961 /* \ - Escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07005962 if (s >= end) {
5963 message = "\\ at end of string";
5964 goto error;
5965 }
5966 c = (unsigned char) *s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005967
Victor Stinner62ec3312016-09-06 17:04:34 -07005968 assert(writer.pos < writer.size);
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005969 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005970
Benjamin Peterson29060642009-01-31 22:14:21 +00005971 /* \x escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07005972 case '\n': continue;
5973 case '\\': WRITE_ASCII_CHAR('\\'); continue;
5974 case '\'': WRITE_ASCII_CHAR('\''); continue;
5975 case '\"': WRITE_ASCII_CHAR('\"'); continue;
5976 case 'b': WRITE_ASCII_CHAR('\b'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005977 /* FF */
Victor Stinner62ec3312016-09-06 17:04:34 -07005978 case 'f': WRITE_ASCII_CHAR('\014'); continue;
5979 case 't': WRITE_ASCII_CHAR('\t'); continue;
5980 case 'n': WRITE_ASCII_CHAR('\n'); continue;
5981 case 'r': WRITE_ASCII_CHAR('\r'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005982 /* VT */
Victor Stinner62ec3312016-09-06 17:04:34 -07005983 case 'v': WRITE_ASCII_CHAR('\013'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005984 /* BEL, not classic C */
Victor Stinner62ec3312016-09-06 17:04:34 -07005985 case 'a': WRITE_ASCII_CHAR('\007'); continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005986
Benjamin Peterson29060642009-01-31 22:14:21 +00005987 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005988 case '0': case '1': case '2': case '3':
5989 case '4': case '5': case '6': case '7':
Victor Stinner62ec3312016-09-06 17:04:34 -07005990 ch = c - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005991 if (s < end && '0' <= *s && *s <= '7') {
Victor Stinner62ec3312016-09-06 17:04:34 -07005992 ch = (ch<<3) + *s++ - '0';
5993 if (s < end && '0' <= *s && *s <= '7') {
5994 ch = (ch<<3) + *s++ - '0';
5995 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005996 }
Victor Stinner62ec3312016-09-06 17:04:34 -07005997 WRITE_CHAR(ch);
5998 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005999
Benjamin Peterson29060642009-01-31 22:14:21 +00006000 /* hex escapes */
6001 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006002 case 'x':
Victor Stinner62ec3312016-09-06 17:04:34 -07006003 count = 2;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006004 message = "truncated \\xXX escape";
6005 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006006
Benjamin Peterson29060642009-01-31 22:14:21 +00006007 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006008 case 'u':
Victor Stinner62ec3312016-09-06 17:04:34 -07006009 count = 4;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006010 message = "truncated \\uXXXX escape";
6011 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006012
Benjamin Peterson29060642009-01-31 22:14:21 +00006013 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00006014 case 'U':
Victor Stinner62ec3312016-09-06 17:04:34 -07006015 count = 8;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006016 message = "truncated \\UXXXXXXXX escape";
6017 hexescape:
Victor Stinner62ec3312016-09-06 17:04:34 -07006018 for (ch = 0; count && s < end; ++s, --count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006019 c = (unsigned char)*s;
Victor Stinner62ec3312016-09-06 17:04:34 -07006020 ch <<= 4;
6021 if (c >= '0' && c <= '9') {
6022 ch += c - '0';
6023 }
6024 else if (c >= 'a' && c <= 'f') {
6025 ch += c - ('a' - 10);
6026 }
6027 else if (c >= 'A' && c <= 'F') {
6028 ch += c - ('A' - 10);
6029 }
6030 else {
6031 break;
6032 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00006033 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006034 if (count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006035 goto error;
Victor Stinner62ec3312016-09-06 17:04:34 -07006036 }
6037
6038 /* when we get here, ch is a 32-bit unicode character */
6039 if (ch > MAX_UNICODE) {
6040 message = "illegal Unicode character";
6041 goto error;
6042 }
6043
6044 WRITE_CHAR(ch);
6045 continue;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006046
Benjamin Peterson29060642009-01-31 22:14:21 +00006047 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006048 case 'N':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006049 if (ucnhash_CAPI == NULL) {
6050 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006051 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6052 PyUnicodeData_CAPSULE_NAME, 1);
Victor Stinner62ec3312016-09-06 17:04:34 -07006053 if (ucnhash_CAPI == NULL) {
6054 PyErr_SetString(
6055 PyExc_UnicodeError,
6056 "\\N escapes not supported (can't load unicodedata module)"
6057 );
6058 goto onError;
6059 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006060 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006061
6062 message = "malformed \\N character escape";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006063 if (*s == '{') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006064 const char *start = ++s;
6065 size_t namelen;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006066 /* look for the closing brace */
Victor Stinner62ec3312016-09-06 17:04:34 -07006067 while (s < end && *s != '}')
Fredrik Lundhccc74732001-02-18 22:13:49 +00006068 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006069 namelen = s - start;
6070 if (namelen && s < end) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00006071 /* found a name. look it up in the unicode database */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006072 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006073 ch = 0xffffffff; /* in case 'getcode' messes up */
6074 if (namelen <= INT_MAX &&
6075 ucnhash_CAPI->getcode(NULL, start, (int)namelen,
6076 &ch, 0)) {
6077 assert(ch <= MAX_UNICODE);
6078 WRITE_CHAR(ch);
6079 continue;
6080 }
6081 message = "unknown Unicode character name";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006082 }
6083 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006084 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006085
6086 default:
Eric V. Smith56466482016-10-31 14:46:26 -04006087 if (*first_invalid_escape == NULL) {
6088 *first_invalid_escape = s-1; /* Back up one char, since we've
6089 already incremented s. */
6090 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006091 WRITE_ASCII_CHAR('\\');
6092 WRITE_CHAR(c);
6093 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006094 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006095
6096 error:
6097 endinpos = s-starts;
Victor Stinner62ec3312016-09-06 17:04:34 -07006098 writer.min_length = end - s + writer.pos;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006099 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02006100 errors, &errorHandler,
6101 "unicodeescape", message,
6102 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner62ec3312016-09-06 17:04:34 -07006103 &writer)) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006104 goto onError;
Victor Stinner62ec3312016-09-06 17:04:34 -07006105 }
6106 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0) {
6107 goto onError;
6108 }
6109
6110#undef WRITE_ASCII_CHAR
6111#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006112 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006113
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006114 Py_XDECREF(errorHandler);
6115 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006116 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00006117
Benjamin Peterson29060642009-01-31 22:14:21 +00006118 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006119 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006120 Py_XDECREF(errorHandler);
6121 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006122 return NULL;
6123}
6124
Eric V. Smith56466482016-10-31 14:46:26 -04006125PyObject *
6126PyUnicode_DecodeUnicodeEscape(const char *s,
6127 Py_ssize_t size,
6128 const char *errors)
6129{
6130 const char *first_invalid_escape;
6131 PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
6132 &first_invalid_escape);
6133 if (result == NULL)
6134 return NULL;
6135 if (first_invalid_escape != NULL) {
6136 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6137 "invalid escape sequence '\\%c'",
6138 *first_invalid_escape) < 0) {
6139 Py_DECREF(result);
6140 return NULL;
6141 }
6142 }
6143 return result;
6144}
6145
/* Return a Unicode-Escape encoded bytes version of the Unicode object.

   Only the escaped characters are produced; the result is not enclosed
   in quotes.

*/
6152
Alexander Belopolsky40018472011-02-26 01:02:56 +00006153PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006154PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006155{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006156 Py_ssize_t i, len;
Victor Stinner62ec3312016-09-06 17:04:34 -07006157 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006158 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006159 enum PyUnicode_Kind kind;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006160 void *data;
Victor Stinner62ec3312016-09-06 17:04:34 -07006161 Py_ssize_t expandsize;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006162
Ezio Melottie7f90372012-10-05 03:33:31 +03006163 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00006164 escape.
6165
Ezio Melottie7f90372012-10-05 03:33:31 +03006166 For UCS1 strings it's '\xxx', 4 bytes per source character.
6167 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6168 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00006169 */
6170
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006171 if (!PyUnicode_Check(unicode)) {
6172 PyErr_BadArgument();
6173 return NULL;
6174 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006175 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006176 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006177 }
Victor Stinner358af132015-10-12 22:36:57 +02006178
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006179 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006180 if (len == 0) {
6181 return PyBytes_FromStringAndSize(NULL, 0);
6182 }
6183
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006184 kind = PyUnicode_KIND(unicode);
6185 data = PyUnicode_DATA(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006186 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6187 bytes, and 1 byte characters 4. */
6188 expandsize = kind * 2 + 2;
6189 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize) {
6190 return PyErr_NoMemory();
6191 }
6192 repr = PyBytes_FromStringAndSize(NULL, 2 + expandsize * len + 1);
6193 if (repr == NULL) {
6194 return NULL;
6195 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006196
Victor Stinner62ec3312016-09-06 17:04:34 -07006197 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006198 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006199 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006200
Victor Stinner62ec3312016-09-06 17:04:34 -07006201 /* U+0000-U+00ff range */
6202 if (ch < 0x100) {
6203 if (ch >= ' ' && ch < 127) {
6204 if (ch != '\\') {
6205 /* Copy printable US ASCII as-is */
6206 *p++ = (char) ch;
6207 }
6208 /* Escape backslashes */
6209 else {
6210 *p++ = '\\';
6211 *p++ = '\\';
6212 }
6213 }
Victor Stinner358af132015-10-12 22:36:57 +02006214
Victor Stinner62ec3312016-09-06 17:04:34 -07006215 /* Map special whitespace to '\t', \n', '\r' */
6216 else if (ch == '\t') {
6217 *p++ = '\\';
6218 *p++ = 't';
6219 }
6220 else if (ch == '\n') {
6221 *p++ = '\\';
6222 *p++ = 'n';
6223 }
6224 else if (ch == '\r') {
6225 *p++ = '\\';
6226 *p++ = 'r';
6227 }
6228
6229 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6230 else {
6231 *p++ = '\\';
6232 *p++ = 'x';
6233 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6234 *p++ = Py_hexdigits[ch & 0x000F];
6235 }
Tim Petersced69f82003-09-16 20:30:58 +00006236 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006237 /* U+0000-U+00ff range: Map 16-bit characters to '\uHHHH' */
6238 else if (ch < 0x10000) {
6239 /* U+0100-U+ffff */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006240 *p++ = '\\';
6241 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006242 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6243 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6244 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6245 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006246 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006247 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6248 else {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006249
Victor Stinner62ec3312016-09-06 17:04:34 -07006250 /* Make sure that the first two digits are zero */
6251 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006252 *p++ = '\\';
Victor Stinner62ec3312016-09-06 17:04:34 -07006253 *p++ = 'U';
6254 *p++ = '0';
6255 *p++ = '0';
6256 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6257 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6258 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6259 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6260 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6261 *p++ = Py_hexdigits[ch & 0x0000000F];
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006262 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006263 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006264
Victor Stinner62ec3312016-09-06 17:04:34 -07006265 assert(p - PyBytes_AS_STRING(repr) > 0);
6266 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6267 return NULL;
6268 }
6269 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006270}
6271
Alexander Belopolsky40018472011-02-26 01:02:56 +00006272PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006273PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6274 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006275{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006276 PyObject *result;
6277 PyObject *tmp = PyUnicode_FromUnicode(s, size);
Victor Stinner62ec3312016-09-06 17:04:34 -07006278 if (tmp == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006279 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006280 }
6281
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006282 result = PyUnicode_AsUnicodeEscapeString(tmp);
6283 Py_DECREF(tmp);
6284 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006285}
6286
6287/* --- Raw Unicode Escape Codec ------------------------------------------- */
6288
Alexander Belopolsky40018472011-02-26 01:02:56 +00006289PyObject *
6290PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006291 Py_ssize_t size,
6292 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006293{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006294 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006295 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006296 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006297 PyObject *errorHandler = NULL;
6298 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006299
Victor Stinner62ec3312016-09-06 17:04:34 -07006300 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006301 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006302 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006303
Guido van Rossumd57fd912000-03-10 22:53:23 +00006304 /* Escaped strings will always be longer than the resulting
6305 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006306 length after conversion to the true value. (But decoding error
6307 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006308 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07006309 writer.min_length = size;
6310 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6311 goto onError;
6312 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006313
Guido van Rossumd57fd912000-03-10 22:53:23 +00006314 end = s + size;
6315 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006316 unsigned char c = (unsigned char) *s++;
6317 Py_UCS4 ch;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006318 int count;
Victor Stinner62ec3312016-09-06 17:04:34 -07006319 Py_ssize_t startinpos;
6320 Py_ssize_t endinpos;
6321 const char *message;
6322
6323#define WRITE_CHAR(ch) \
6324 do { \
6325 if (ch <= writer.maxchar) { \
6326 assert(writer.pos < writer.size); \
6327 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6328 } \
6329 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6330 goto onError; \
6331 } \
6332 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006333
Benjamin Peterson29060642009-01-31 22:14:21 +00006334 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006335 if (c != '\\' || s >= end) {
6336 WRITE_CHAR(c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006337 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006338 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006339
Victor Stinner62ec3312016-09-06 17:04:34 -07006340 c = (unsigned char) *s++;
6341 if (c == 'u') {
6342 count = 4;
6343 message = "truncated \\uXXXX escape";
Benjamin Peterson29060642009-01-31 22:14:21 +00006344 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006345 else if (c == 'U') {
6346 count = 8;
6347 message = "truncated \\UXXXXXXXX escape";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006348 }
6349 else {
Victor Stinner62ec3312016-09-06 17:04:34 -07006350 assert(writer.pos < writer.size);
6351 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6352 WRITE_CHAR(c);
6353 continue;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006354 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006355 startinpos = s - starts - 2;
6356
6357 /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6358 for (ch = 0; count && s < end; ++s, --count) {
6359 c = (unsigned char)*s;
6360 ch <<= 4;
6361 if (c >= '0' && c <= '9') {
6362 ch += c - '0';
6363 }
6364 else if (c >= 'a' && c <= 'f') {
6365 ch += c - ('a' - 10);
6366 }
6367 else if (c >= 'A' && c <= 'F') {
6368 ch += c - ('A' - 10);
6369 }
6370 else {
6371 break;
6372 }
6373 }
6374 if (!count) {
6375 if (ch <= MAX_UNICODE) {
6376 WRITE_CHAR(ch);
6377 continue;
6378 }
6379 message = "\\Uxxxxxxxx out of range";
6380 }
6381
6382 endinpos = s-starts;
6383 writer.min_length = end - s + writer.pos;
6384 if (unicode_decode_call_errorhandler_writer(
6385 errors, &errorHandler,
6386 "rawunicodeescape", message,
6387 &starts, &end, &startinpos, &endinpos, &exc, &s,
6388 &writer)) {
6389 goto onError;
6390 }
6391 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0) {
6392 goto onError;
6393 }
6394
6395#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006396 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006397 Py_XDECREF(errorHandler);
6398 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006399 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006400
Benjamin Peterson29060642009-01-31 22:14:21 +00006401 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006402 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006403 Py_XDECREF(errorHandler);
6404 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006405 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006406
Guido van Rossumd57fd912000-03-10 22:53:23 +00006407}
6408
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006409
Alexander Belopolsky40018472011-02-26 01:02:56 +00006410PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006411PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006412{
Victor Stinner62ec3312016-09-06 17:04:34 -07006413 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006414 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006415 Py_ssize_t expandsize, pos;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006416 int kind;
6417 void *data;
6418 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006419
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006420 if (!PyUnicode_Check(unicode)) {
6421 PyErr_BadArgument();
6422 return NULL;
6423 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006424 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006425 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006426 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006427 kind = PyUnicode_KIND(unicode);
6428 data = PyUnicode_DATA(unicode);
6429 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006430 if (kind == PyUnicode_1BYTE_KIND) {
6431 return PyBytes_FromStringAndSize(data, len);
6432 }
Victor Stinner0e368262011-11-10 20:12:49 +01006433
Victor Stinner62ec3312016-09-06 17:04:34 -07006434 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6435 bytes, and 1 byte characters 4. */
6436 expandsize = kind * 2 + 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006437
Victor Stinner62ec3312016-09-06 17:04:34 -07006438 if (len > PY_SSIZE_T_MAX / expandsize) {
6439 return PyErr_NoMemory();
6440 }
6441 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6442 if (repr == NULL) {
6443 return NULL;
6444 }
6445 if (len == 0) {
6446 return repr;
6447 }
6448
6449 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006450 for (pos = 0; pos < len; pos++) {
6451 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Victor Stinner358af132015-10-12 22:36:57 +02006452
Victor Stinner62ec3312016-09-06 17:04:34 -07006453 /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6454 if (ch < 0x100) {
6455 *p++ = (char) ch;
Tim Petersced69f82003-09-16 20:30:58 +00006456 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006457 /* U+0000-U+00ff range: Map 16-bit characters to '\uHHHH' */
6458 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006459 *p++ = '\\';
6460 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006461 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6462 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6463 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6464 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006465 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006466 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6467 else {
6468 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6469 *p++ = '\\';
6470 *p++ = 'U';
6471 *p++ = '0';
6472 *p++ = '0';
6473 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6474 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6475 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6476 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6477 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6478 *p++ = Py_hexdigits[ch & 15];
6479 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006480 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006481
Victor Stinner62ec3312016-09-06 17:04:34 -07006482 assert(p > PyBytes_AS_STRING(repr));
6483 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6484 return NULL;
6485 }
6486 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006487}
6488
Alexander Belopolsky40018472011-02-26 01:02:56 +00006489PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006490PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6491 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006492{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006493 PyObject *result;
6494 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6495 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006496 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006497 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6498 Py_DECREF(tmp);
6499 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006500}
6501
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006502/* --- Unicode Internal Codec ------------------------------------------- */
6503
Alexander Belopolsky40018472011-02-26 01:02:56 +00006504PyObject *
6505_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006506 Py_ssize_t size,
6507 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006508{
6509 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006510 Py_ssize_t startinpos;
6511 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006512 _PyUnicodeWriter writer;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006513 const char *end;
6514 const char *reason;
6515 PyObject *errorHandler = NULL;
6516 PyObject *exc = NULL;
6517
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006518 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006519 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006520 1))
6521 return NULL;
6522
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006523 if (size == 0)
6524 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006525
Victor Stinner8f674cc2013-04-17 23:02:17 +02006526 _PyUnicodeWriter_Init(&writer);
6527 if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) {
6528 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00006529 goto onError;
Victor Stinner8f674cc2013-04-17 23:02:17 +02006530 }
6531 writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006532
Victor Stinner8f674cc2013-04-17 23:02:17 +02006533 end = s + size;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006534 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006535 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006536 Py_UCS4 ch;
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006537 if (end - s < Py_UNICODE_SIZE) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006538 endinpos = end-starts;
6539 reason = "truncated input";
6540 goto error;
6541 }
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006542 /* We copy the raw representation one byte at a time because the
6543 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006544 ((char *) &uch)[0] = s[0];
6545 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006546#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006547 ((char *) &uch)[2] = s[2];
6548 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006549#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006550 ch = uch;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006551#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006552 /* We have to sanity check the raw data, otherwise doom looms for
6553 some malformed UCS-4 data. */
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006554 if (ch > 0x10ffff) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006555 endinpos = s - starts + Py_UNICODE_SIZE;
6556 reason = "illegal code point (> 0x10FFFF)";
6557 goto error;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006558 }
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006559#endif
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006560 s += Py_UNICODE_SIZE;
6561#ifndef Py_UNICODE_WIDE
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006562 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006563 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006564 Py_UNICODE uch2;
6565 ((char *) &uch2)[0] = s[0];
6566 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006567 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006568 {
Victor Stinner551ac952011-11-29 22:58:13 +01006569 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006570 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006571 }
6572 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006573#endif
6574
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006575 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006576 goto onError;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006577 continue;
6578
6579 error:
6580 startinpos = s - starts;
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006581 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006582 errors, &errorHandler,
6583 "unicode_internal", reason,
6584 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006585 &writer))
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006586 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006587 }
6588
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006589 Py_XDECREF(errorHandler);
6590 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006591 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006592
Benjamin Peterson29060642009-01-31 22:14:21 +00006593 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006594 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006595 Py_XDECREF(errorHandler);
6596 Py_XDECREF(exc);
6597 return NULL;
6598}
6599
Guido van Rossumd57fd912000-03-10 22:53:23 +00006600/* --- Latin-1 Codec ------------------------------------------------------ */
6601
Alexander Belopolsky40018472011-02-26 01:02:56 +00006602PyObject *
6603PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006604 Py_ssize_t size,
6605 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006606{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006607 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006608 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006609}
6610
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006611/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006612static void
6613make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006614 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006615 PyObject *unicode,
6616 Py_ssize_t startpos, Py_ssize_t endpos,
6617 const char *reason)
6618{
6619 if (*exceptionObject == NULL) {
6620 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006621 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006622 encoding, unicode, startpos, endpos, reason);
6623 }
6624 else {
6625 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6626 goto onError;
6627 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6628 goto onError;
6629 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6630 goto onError;
6631 return;
6632 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006633 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006634 }
6635}
6636
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006637/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006638static void
6639raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006640 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006641 PyObject *unicode,
6642 Py_ssize_t startpos, Py_ssize_t endpos,
6643 const char *reason)
6644{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006645 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006646 encoding, unicode, startpos, endpos, reason);
6647 if (*exceptionObject != NULL)
6648 PyCodec_StrictErrors(*exceptionObject);
6649}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006650
6651/* error handling callback helper:
6652 build arguments, call the callback and check the arguments,
6653 put the result into newpos and return the replacement string, which
6654 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006655static PyObject *
6656unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006657 PyObject **errorHandler,
6658 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006659 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006660 Py_ssize_t startpos, Py_ssize_t endpos,
6661 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006662{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02006663 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006664 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006665 PyObject *restuple;
6666 PyObject *resunicode;
6667
6668 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006669 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006670 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006671 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006672 }
6673
Benjamin Petersonbac79492012-01-14 13:34:47 -05006674 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006675 return NULL;
6676 len = PyUnicode_GET_LENGTH(unicode);
6677
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006678 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006679 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006680 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006681 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006682
6683 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006684 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006685 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006686 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006687 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006688 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006689 Py_DECREF(restuple);
6690 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006691 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006692 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006693 &resunicode, newpos)) {
6694 Py_DECREF(restuple);
6695 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006696 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006697 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6698 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6699 Py_DECREF(restuple);
6700 return NULL;
6701 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006702 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006703 *newpos = len + *newpos;
6704 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02006705 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006706 Py_DECREF(restuple);
6707 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006708 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006709 Py_INCREF(resunicode);
6710 Py_DECREF(restuple);
6711 return resunicode;
6712}
6713
Alexander Belopolsky40018472011-02-26 01:02:56 +00006714static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006715unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006716 const char *errors,
Victor Stinner0030cd52015-09-24 14:45:00 +02006717 const Py_UCS4 limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006718{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006719 /* input state */
6720 Py_ssize_t pos=0, size;
6721 int kind;
6722 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006723 /* pointer into the output */
6724 char *str;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006725 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6726 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Victor Stinner50149202015-09-22 00:26:54 +02006727 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006728 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02006729 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006730 PyObject *rep = NULL;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006731 /* output object */
6732 _PyBytesWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006733
Benjamin Petersonbac79492012-01-14 13:34:47 -05006734 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006735 return NULL;
6736 size = PyUnicode_GET_LENGTH(unicode);
6737 kind = PyUnicode_KIND(unicode);
6738 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006739 /* allocate enough for a simple encoding without
6740 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006741 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006742 return PyBytes_FromStringAndSize(NULL, 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006743
6744 _PyBytesWriter_Init(&writer);
6745 str = _PyBytesWriter_Alloc(&writer, size);
6746 if (str == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006747 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006748
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006749 while (pos < size) {
Victor Stinner0030cd52015-09-24 14:45:00 +02006750 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006751
Benjamin Peterson29060642009-01-31 22:14:21 +00006752 /* can we encode this? */
Victor Stinner0030cd52015-09-24 14:45:00 +02006753 if (ch < limit) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006754 /* no overflow check, because we know that the space is enough */
Victor Stinner0030cd52015-09-24 14:45:00 +02006755 *str++ = (char)ch;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006756 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006757 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006758 else {
Victor Stinner6bd525b2015-10-09 13:10:05 +02006759 Py_ssize_t newpos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006760 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006761 Py_ssize_t collstart = pos;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006762 Py_ssize_t collend = collstart + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006763 /* find all unecodable characters */
Victor Stinner50149202015-09-22 00:26:54 +02006764
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006765 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006766 ++collend;
Victor Stinner50149202015-09-22 00:26:54 +02006767
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006768 /* Only overallocate the buffer if it's not the last write */
6769 writer.overallocate = (collend < size);
6770
Benjamin Peterson29060642009-01-31 22:14:21 +00006771 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02006772 if (error_handler == _Py_ERROR_UNKNOWN)
6773 error_handler = get_error_handler(errors);
6774
6775 switch (error_handler) {
6776 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006777 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006778 goto onError;
Victor Stinner50149202015-09-22 00:26:54 +02006779
6780 case _Py_ERROR_REPLACE:
Victor Stinner01ada392015-10-01 21:54:51 +02006781 memset(str, '?', collend - collstart);
6782 str += (collend - collstart);
Victor Stinner0030cd52015-09-24 14:45:00 +02006783 /* fall through ignore error handler */
Victor Stinner50149202015-09-22 00:26:54 +02006784 case _Py_ERROR_IGNORE:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006785 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006786 break;
Victor Stinner50149202015-09-22 00:26:54 +02006787
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006788 case _Py_ERROR_BACKSLASHREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006789 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006790 writer.min_size -= (collend - collstart);
6791 str = backslashreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006792 unicode, collstart, collend);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006793 if (str == NULL)
6794 goto onError;
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006795 pos = collend;
6796 break;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006797
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006798 case _Py_ERROR_XMLCHARREFREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006799 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006800 writer.min_size -= (collend - collstart);
6801 str = xmlcharrefreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006802 unicode, collstart, collend);
6803 if (str == NULL)
6804 goto onError;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006805 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006806 break;
Victor Stinner50149202015-09-22 00:26:54 +02006807
Victor Stinnerc3713e92015-09-29 12:32:13 +02006808 case _Py_ERROR_SURROGATEESCAPE:
6809 for (i = collstart; i < collend; ++i) {
6810 ch = PyUnicode_READ(kind, data, i);
6811 if (ch < 0xdc80 || 0xdcff < ch) {
6812 /* Not a UTF-8b surrogate */
6813 break;
6814 }
6815 *str++ = (char)(ch - 0xdc00);
6816 ++pos;
6817 }
6818 if (i >= collend)
6819 break;
6820 collstart = pos;
6821 assert(collstart != collend);
6822 /* fallback to general error handling */
6823
Benjamin Peterson29060642009-01-31 22:14:21 +00006824 default:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006825 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
6826 encoding, reason, unicode, &exc,
6827 collstart, collend, &newpos);
6828 if (rep == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006829 goto onError;
Victor Stinner0030cd52015-09-24 14:45:00 +02006830
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006831 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006832 writer.min_size -= 1;
6833
Victor Stinner6bd525b2015-10-09 13:10:05 +02006834 if (PyBytes_Check(rep)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006835 /* Directly copy bytes result to output. */
Victor Stinnerce179bf2015-10-09 12:57:22 +02006836 str = _PyBytesWriter_WriteBytes(&writer, str,
Victor Stinner6bd525b2015-10-09 13:10:05 +02006837 PyBytes_AS_STRING(rep),
6838 PyBytes_GET_SIZE(rep));
Victor Stinnerad771582015-10-09 12:38:53 +02006839 if (str == NULL)
6840 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006841 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02006842 else {
6843 assert(PyUnicode_Check(rep));
Victor Stinner0030cd52015-09-24 14:45:00 +02006844
Victor Stinner6bd525b2015-10-09 13:10:05 +02006845 if (PyUnicode_READY(rep) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006846 goto onError;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006847
6848 if (PyUnicode_IS_ASCII(rep)) {
6849 /* Fast path: all characters are smaller than limit */
6850 assert(limit >= 128);
6851 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6852 str = _PyBytesWriter_WriteBytes(&writer, str,
6853 PyUnicode_DATA(rep),
6854 PyUnicode_GET_LENGTH(rep));
Benjamin Peterson29060642009-01-31 22:14:21 +00006855 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02006856 else {
6857 Py_ssize_t repsize = PyUnicode_GET_LENGTH(rep);
6858
6859 str = _PyBytesWriter_Prepare(&writer, str, repsize);
6860 if (str == NULL)
6861 goto onError;
6862
6863 /* check if there is anything unencodable in the
6864 replacement and copy it to the output */
6865 for (i = 0; repsize-->0; ++i, ++str) {
6866 ch = PyUnicode_READ_CHAR(rep, i);
6867 if (ch >= limit) {
6868 raise_encode_exception(&exc, encoding, unicode,
6869 pos, pos+1, reason);
6870 goto onError;
6871 }
6872 *str = (char)ch;
6873 }
6874 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006875 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006876 pos = newpos;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006877 Py_CLEAR(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006878 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006879
6880 /* If overallocation was disabled, ensure that it was the last
6881 write. Otherwise, we missed an optimization */
6882 assert(writer.overallocate || pos == size);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006883 }
6884 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006885
Victor Stinner50149202015-09-22 00:26:54 +02006886 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006887 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006888 return _PyBytesWriter_Finish(&writer, str);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006889
6890 onError:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006891 Py_XDECREF(rep);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006892 _PyBytesWriter_Dealloc(&writer);
Victor Stinner50149202015-09-22 00:26:54 +02006893 Py_XDECREF(error_handler_obj);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006894 Py_XDECREF(exc);
6895 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006896}
6897
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006898/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006899PyObject *
6900PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006901 Py_ssize_t size,
6902 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006903{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006904 PyObject *result;
6905 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6906 if (unicode == NULL)
6907 return NULL;
6908 result = unicode_encode_ucs1(unicode, errors, 256);
6909 Py_DECREF(unicode);
6910 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006911}
6912
Alexander Belopolsky40018472011-02-26 01:02:56 +00006913PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006914_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006915{
6916 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006917 PyErr_BadArgument();
6918 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006919 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006920 if (PyUnicode_READY(unicode) == -1)
6921 return NULL;
6922 /* Fast path: if it is a one-byte string, construct
6923 bytes object directly. */
6924 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6925 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6926 PyUnicode_GET_LENGTH(unicode));
6927 /* Non-Latin-1 characters present. Defer to above function to
6928 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006929 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006930}
6931
6932PyObject*
6933PyUnicode_AsLatin1String(PyObject *unicode)
6934{
6935 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006936}
6937
6938/* --- 7-bit ASCII Codec -------------------------------------------------- */
6939
Alexander Belopolsky40018472011-02-26 01:02:56 +00006940PyObject *
6941PyUnicode_DecodeASCII(const char *s,
6942 Py_ssize_t size,
6943 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006944{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006945 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006946 _PyUnicodeWriter writer;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006947 int kind;
6948 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006949 Py_ssize_t startinpos;
6950 Py_ssize_t endinpos;
6951 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006952 const char *e;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006953 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006954 PyObject *exc = NULL;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006955 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Tim Petersced69f82003-09-16 20:30:58 +00006956
Guido van Rossumd57fd912000-03-10 22:53:23 +00006957 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02006958 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006959
Guido van Rossumd57fd912000-03-10 22:53:23 +00006960 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006961 if (size == 1 && (unsigned char)s[0] < 128)
6962 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006963
Victor Stinner8f674cc2013-04-17 23:02:17 +02006964 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02006965 writer.min_length = size;
6966 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
Victor Stinner8f674cc2013-04-17 23:02:17 +02006967 return NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006968
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006969 e = s + size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006970 data = writer.data;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006971 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006972 writer.pos = outpos;
6973 if (writer.pos == size)
6974 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006975
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006976 s += writer.pos;
6977 kind = writer.kind;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006978 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02006979 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00006980 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006981 PyUnicode_WRITE(kind, data, writer.pos, c);
6982 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00006983 ++s;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006984 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +00006985 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02006986
6987 /* byte outsize range 0x00..0x7f: call the error handler */
6988
6989 if (error_handler == _Py_ERROR_UNKNOWN)
6990 error_handler = get_error_handler(errors);
6991
6992 switch (error_handler)
6993 {
6994 case _Py_ERROR_REPLACE:
6995 case _Py_ERROR_SURROGATEESCAPE:
6996 /* Fast-path: the error handler only writes one character,
Victor Stinnerca9381e2015-09-22 00:58:32 +02006997 but we may switch to UCS2 at the first write */
6998 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
6999 goto onError;
7000 kind = writer.kind;
7001 data = writer.data;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007002
7003 if (error_handler == _Py_ERROR_REPLACE)
7004 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
7005 else
7006 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
7007 writer.pos++;
7008 ++s;
7009 break;
7010
7011 case _Py_ERROR_IGNORE:
7012 ++s;
7013 break;
7014
7015 default:
Benjamin Peterson29060642009-01-31 22:14:21 +00007016 startinpos = s-starts;
7017 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007018 if (unicode_decode_call_errorhandler_writer(
Victor Stinnerf96418d2015-09-21 23:06:27 +02007019 errors, &error_handler_obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007020 "ascii", "ordinal not in range(128)",
7021 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007022 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00007023 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007024 kind = writer.kind;
7025 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00007026 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007027 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007028 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007029 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007030 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007031
Benjamin Peterson29060642009-01-31 22:14:21 +00007032 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007033 _PyUnicodeWriter_Dealloc(&writer);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007034 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007035 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007036 return NULL;
7037}
7038
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007039/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007040PyObject *
7041PyUnicode_EncodeASCII(const Py_UNICODE *p,
7042 Py_ssize_t size,
7043 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007044{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007045 PyObject *result;
7046 PyObject *unicode = PyUnicode_FromUnicode(p, size);
7047 if (unicode == NULL)
7048 return NULL;
7049 result = unicode_encode_ucs1(unicode, errors, 128);
7050 Py_DECREF(unicode);
7051 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007052}
7053
Alexander Belopolsky40018472011-02-26 01:02:56 +00007054PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007055_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007056{
7057 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007058 PyErr_BadArgument();
7059 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007060 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007061 if (PyUnicode_READY(unicode) == -1)
7062 return NULL;
7063 /* Fast path: if it is an ASCII-only string, construct bytes object
7064 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02007065 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007066 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7067 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007068 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007069}
7070
7071PyObject *
7072PyUnicode_AsASCIIString(PyObject *unicode)
7073{
7074 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007075}
7076
Steve Dowercc16be82016-09-08 10:35:16 -07007077#ifdef MS_WINDOWS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007078
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007079/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007080
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00007081#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007082#define NEED_RETRY
7083#endif
7084
Victor Stinner3a50e702011-10-18 21:21:00 +02007085#ifndef WC_ERR_INVALID_CHARS
7086# define WC_ERR_INVALID_CHARS 0x0080
7087#endif
7088
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007089static const char*
Victor Stinner3a50e702011-10-18 21:21:00 +02007090code_page_name(UINT code_page, PyObject **obj)
7091{
7092 *obj = NULL;
7093 if (code_page == CP_ACP)
7094 return "mbcs";
7095 if (code_page == CP_UTF7)
7096 return "CP_UTF7";
7097 if (code_page == CP_UTF8)
7098 return "CP_UTF8";
7099
7100 *obj = PyBytes_FromFormat("cp%u", code_page);
7101 if (*obj == NULL)
7102 return NULL;
7103 return PyBytes_AS_STRING(*obj);
7104}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007105
Victor Stinner3a50e702011-10-18 21:21:00 +02007106static DWORD
7107decode_code_page_flags(UINT code_page)
7108{
7109 if (code_page == CP_UTF7) {
7110 /* The CP_UTF7 decoder only supports flags=0 */
7111 return 0;
7112 }
7113 else
7114 return MB_ERR_INVALID_CHARS;
7115}
7116
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007117/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007118 * Decode a byte string from a Windows code page into unicode object in strict
7119 * mode.
7120 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007121 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7122 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007123 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007124static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007125decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007126 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02007127 const char *in,
7128 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007129{
Victor Stinner3a50e702011-10-18 21:21:00 +02007130 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01007131 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007132 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007133
7134 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007135 assert(insize > 0);
7136 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
7137 if (outsize <= 0)
7138 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007139
7140 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007141 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01007142 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007143 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00007144 if (*v == NULL)
7145 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007146 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007147 }
7148 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007149 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007150 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01007151 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007152 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007153 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007154 }
7155
7156 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007157 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7158 if (outsize <= 0)
7159 goto error;
7160 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007161
Victor Stinner3a50e702011-10-18 21:21:00 +02007162error:
7163 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7164 return -2;
7165 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007166 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007167}
7168
Victor Stinner3a50e702011-10-18 21:21:00 +02007169/*
7170 * Decode a byte string from a code page into unicode object with an error
7171 * handler.
7172 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007173 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02007174 * UnicodeDecodeError exception and returns -1 on error.
7175 */
7176static int
7177decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007178 PyObject **v,
7179 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007180 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02007181{
7182 const char *startin = in;
7183 const char *endin = in + size;
7184 const DWORD flags = decode_code_page_flags(code_page);
7185 /* Ideally, we should get reason from FormatMessage. This is the Windows
7186 2000 English version of the message. */
7187 const char *reason = "No mapping for the Unicode character exists "
7188 "in the target code page.";
7189 /* each step cannot decode more than 1 character, but a character can be
7190 represented as a surrogate pair */
7191 wchar_t buffer[2], *startout, *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02007192 int insize;
7193 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007194 PyObject *errorHandler = NULL;
7195 PyObject *exc = NULL;
7196 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007197 const char *encoding;
Victor Stinner3a50e702011-10-18 21:21:00 +02007198 DWORD err;
7199 int ret = -1;
7200
7201 assert(size > 0);
7202
7203 encoding = code_page_name(code_page, &encoding_obj);
7204 if (encoding == NULL)
7205 return -1;
7206
Victor Stinner7d00cc12014-03-17 23:08:06 +01007207 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007208 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7209 UnicodeDecodeError. */
7210 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7211 if (exc != NULL) {
7212 PyCodec_StrictErrors(exc);
7213 Py_CLEAR(exc);
7214 }
7215 goto error;
7216 }
7217
7218 if (*v == NULL) {
7219 /* Create unicode object */
7220 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7221 PyErr_NoMemory();
7222 goto error;
7223 }
Victor Stinnerab595942011-12-17 04:59:06 +01007224 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007225 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02007226 if (*v == NULL)
7227 goto error;
7228 startout = PyUnicode_AS_UNICODE(*v);
7229 }
7230 else {
7231 /* Extend unicode object */
7232 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7233 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7234 PyErr_NoMemory();
7235 goto error;
7236 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007237 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007238 goto error;
7239 startout = PyUnicode_AS_UNICODE(*v) + n;
7240 }
7241
7242 /* Decode the byte string character per character */
7243 out = startout;
7244 while (in < endin)
7245 {
7246 /* Decode a character */
7247 insize = 1;
7248 do
7249 {
7250 outsize = MultiByteToWideChar(code_page, flags,
7251 in, insize,
7252 buffer, Py_ARRAY_LENGTH(buffer));
7253 if (outsize > 0)
7254 break;
7255 err = GetLastError();
7256 if (err != ERROR_NO_UNICODE_TRANSLATION
7257 && err != ERROR_INSUFFICIENT_BUFFER)
7258 {
7259 PyErr_SetFromWindowsErr(0);
7260 goto error;
7261 }
7262 insize++;
7263 }
7264 /* 4=maximum length of a UTF-8 sequence */
7265 while (insize <= 4 && (in + insize) <= endin);
7266
7267 if (outsize <= 0) {
7268 Py_ssize_t startinpos, endinpos, outpos;
7269
Victor Stinner7d00cc12014-03-17 23:08:06 +01007270 /* last character in partial decode? */
7271 if (in + insize >= endin && !final)
7272 break;
7273
Victor Stinner3a50e702011-10-18 21:21:00 +02007274 startinpos = in - startin;
7275 endinpos = startinpos + 1;
7276 outpos = out - PyUnicode_AS_UNICODE(*v);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007277 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007278 errors, &errorHandler,
7279 encoding, reason,
7280 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01007281 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007282 {
7283 goto error;
7284 }
Victor Stinner596a6c42011-11-09 00:02:18 +01007285 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007286 }
7287 else {
7288 in += insize;
7289 memcpy(out, buffer, outsize * sizeof(wchar_t));
7290 out += outsize;
7291 }
7292 }
7293
7294 /* write a NUL character at the end */
7295 *out = 0;
7296
7297 /* Extend unicode object */
7298 outsize = out - startout;
7299 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01007300 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007301 goto error;
Victor Stinnere1f17c62014-07-25 14:03:03 +02007302 /* (in - startin) <= size and size is an int */
7303 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02007304
7305error:
7306 Py_XDECREF(encoding_obj);
7307 Py_XDECREF(errorHandler);
7308 Py_XDECREF(exc);
7309 return ret;
7310}
7311
Victor Stinner3a50e702011-10-18 21:21:00 +02007312static PyObject *
7313decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007314 const char *s, Py_ssize_t size,
7315 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007316{
Victor Stinner76a31a62011-11-04 00:05:13 +01007317 PyObject *v = NULL;
7318 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007319
Victor Stinner3a50e702011-10-18 21:21:00 +02007320 if (code_page < 0) {
7321 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7322 return NULL;
7323 }
7324
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007325 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007326 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007327
Victor Stinner76a31a62011-11-04 00:05:13 +01007328 do
7329 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007330#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007331 if (size > INT_MAX) {
7332 chunk_size = INT_MAX;
7333 final = 0;
7334 done = 0;
7335 }
7336 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007337#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007338 {
7339 chunk_size = (int)size;
7340 final = (consumed == NULL);
7341 done = 1;
7342 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007343
Victor Stinner76a31a62011-11-04 00:05:13 +01007344 if (chunk_size == 0 && done) {
7345 if (v != NULL)
7346 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007347 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007348 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007349
Victor Stinner76a31a62011-11-04 00:05:13 +01007350 converted = decode_code_page_strict(code_page, &v,
7351 s, chunk_size);
7352 if (converted == -2)
7353 converted = decode_code_page_errors(code_page, &v,
7354 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007355 errors, final);
7356 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007357
7358 if (converted < 0) {
7359 Py_XDECREF(v);
7360 return NULL;
7361 }
7362
7363 if (consumed)
7364 *consumed += converted;
7365
7366 s += converted;
7367 size -= converted;
7368 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007369
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007370 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007371}
7372
Alexander Belopolsky40018472011-02-26 01:02:56 +00007373PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007374PyUnicode_DecodeCodePageStateful(int code_page,
7375 const char *s,
7376 Py_ssize_t size,
7377 const char *errors,
7378 Py_ssize_t *consumed)
7379{
7380 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7381}
7382
7383PyObject *
7384PyUnicode_DecodeMBCSStateful(const char *s,
7385 Py_ssize_t size,
7386 const char *errors,
7387 Py_ssize_t *consumed)
7388{
7389 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7390}
7391
7392PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007393PyUnicode_DecodeMBCS(const char *s,
7394 Py_ssize_t size,
7395 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007396{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007397 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7398}
7399
Victor Stinner3a50e702011-10-18 21:21:00 +02007400static DWORD
7401encode_code_page_flags(UINT code_page, const char *errors)
7402{
7403 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007404 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007405 }
7406 else if (code_page == CP_UTF7) {
7407 /* CP_UTF7 only supports flags=0 */
7408 return 0;
7409 }
7410 else {
7411 if (errors != NULL && strcmp(errors, "replace") == 0)
7412 return 0;
7413 else
7414 return WC_NO_BEST_FIT_CHARS;
7415 }
7416}
7417
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007418/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007419 * Encode a Unicode string to a Windows code page into a byte string in strict
7420 * mode.
7421 *
7422 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007423 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007424 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007425static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007426encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007427 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007428 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007429{
Victor Stinner554f3f02010-06-16 23:33:54 +00007430 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007431 BOOL *pusedDefaultChar = &usedDefaultChar;
7432 int outsize;
Victor Stinner24729f32011-11-10 20:31:37 +01007433 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007434 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007435 const DWORD flags = encode_code_page_flags(code_page, NULL);
7436 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007437 /* Create a substring so that we can get the UTF-16 representation
7438 of just the slice under consideration. */
7439 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007440
Martin v. Löwis3d325192011-11-04 18:23:06 +01007441 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007442
Victor Stinner3a50e702011-10-18 21:21:00 +02007443 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007444 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007445 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007446 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007447
Victor Stinner2fc507f2011-11-04 20:06:39 +01007448 substring = PyUnicode_Substring(unicode, offset, offset+len);
7449 if (substring == NULL)
7450 return -1;
7451 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7452 if (p == NULL) {
7453 Py_DECREF(substring);
7454 return -1;
7455 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007456 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007457
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007458 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007459 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007460 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007461 NULL, 0,
7462 NULL, pusedDefaultChar);
7463 if (outsize <= 0)
7464 goto error;
7465 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007466 if (pusedDefaultChar && *pusedDefaultChar) {
7467 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007468 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007469 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007470
Victor Stinner3a50e702011-10-18 21:21:00 +02007471 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007472 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007473 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007474 if (*outbytes == NULL) {
7475 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007476 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007477 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007478 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007479 }
7480 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007481 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007482 const Py_ssize_t n = PyBytes_Size(*outbytes);
7483 if (outsize > PY_SSIZE_T_MAX - n) {
7484 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007485 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007486 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007487 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007488 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7489 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007490 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007491 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007492 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007493 }
7494
7495 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007496 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007497 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007498 out, outsize,
7499 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007500 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007501 if (outsize <= 0)
7502 goto error;
7503 if (pusedDefaultChar && *pusedDefaultChar)
7504 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007505 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007506
Victor Stinner3a50e702011-10-18 21:21:00 +02007507error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007508 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007509 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7510 return -2;
7511 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007512 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007513}
7514
Victor Stinner3a50e702011-10-18 21:21:00 +02007515/*
Serhiy Storchakad65c9492015-11-02 14:10:23 +02007516 * Encode a Unicode string to a Windows code page into a byte string using an
Victor Stinner3a50e702011-10-18 21:21:00 +02007517 * error handler.
7518 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007519 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007520 * -1 on other error.
7521 */
7522static int
7523encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007524 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007525 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007526{
Victor Stinner3a50e702011-10-18 21:21:00 +02007527 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007528 Py_ssize_t pos = unicode_offset;
7529 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007530 /* Ideally, we should get reason from FormatMessage. This is the Windows
7531 2000 English version of the message. */
7532 const char *reason = "invalid character";
7533 /* 4=maximum length of a UTF-8 sequence */
7534 char buffer[4];
7535 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7536 Py_ssize_t outsize;
7537 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007538 PyObject *errorHandler = NULL;
7539 PyObject *exc = NULL;
7540 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007541 const char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007542 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007543 PyObject *rep;
7544 int ret = -1;
7545
7546 assert(insize > 0);
7547
7548 encoding = code_page_name(code_page, &encoding_obj);
7549 if (encoding == NULL)
7550 return -1;
7551
7552 if (errors == NULL || strcmp(errors, "strict") == 0) {
7553 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7554 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007555 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007556 if (exc != NULL) {
7557 PyCodec_StrictErrors(exc);
7558 Py_DECREF(exc);
7559 }
7560 Py_XDECREF(encoding_obj);
7561 return -1;
7562 }
7563
7564 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7565 pusedDefaultChar = &usedDefaultChar;
7566 else
7567 pusedDefaultChar = NULL;
7568
7569 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7570 PyErr_NoMemory();
7571 goto error;
7572 }
7573 outsize = insize * Py_ARRAY_LENGTH(buffer);
7574
7575 if (*outbytes == NULL) {
7576 /* Create string object */
7577 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7578 if (*outbytes == NULL)
7579 goto error;
7580 out = PyBytes_AS_STRING(*outbytes);
7581 }
7582 else {
7583 /* Extend string object */
7584 Py_ssize_t n = PyBytes_Size(*outbytes);
7585 if (n > PY_SSIZE_T_MAX - outsize) {
7586 PyErr_NoMemory();
7587 goto error;
7588 }
7589 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7590 goto error;
7591 out = PyBytes_AS_STRING(*outbytes) + n;
7592 }
7593
7594 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007595 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007596 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007597 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7598 wchar_t chars[2];
7599 int charsize;
7600 if (ch < 0x10000) {
7601 chars[0] = (wchar_t)ch;
7602 charsize = 1;
7603 }
7604 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007605 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7606 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007607 charsize = 2;
7608 }
7609
Victor Stinner3a50e702011-10-18 21:21:00 +02007610 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007611 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007612 buffer, Py_ARRAY_LENGTH(buffer),
7613 NULL, pusedDefaultChar);
7614 if (outsize > 0) {
7615 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7616 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007617 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007618 memcpy(out, buffer, outsize);
7619 out += outsize;
7620 continue;
7621 }
7622 }
7623 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7624 PyErr_SetFromWindowsErr(0);
7625 goto error;
7626 }
7627
Victor Stinner3a50e702011-10-18 21:21:00 +02007628 rep = unicode_encode_call_errorhandler(
7629 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007630 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007631 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007632 if (rep == NULL)
7633 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007634 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007635
7636 if (PyBytes_Check(rep)) {
7637 outsize = PyBytes_GET_SIZE(rep);
7638 if (outsize != 1) {
7639 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7640 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7641 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7642 Py_DECREF(rep);
7643 goto error;
7644 }
7645 out = PyBytes_AS_STRING(*outbytes) + offset;
7646 }
7647 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7648 out += outsize;
7649 }
7650 else {
7651 Py_ssize_t i;
7652 enum PyUnicode_Kind kind;
7653 void *data;
7654
Benjamin Petersonbac79492012-01-14 13:34:47 -05007655 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007656 Py_DECREF(rep);
7657 goto error;
7658 }
7659
7660 outsize = PyUnicode_GET_LENGTH(rep);
7661 if (outsize != 1) {
7662 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7663 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7664 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7665 Py_DECREF(rep);
7666 goto error;
7667 }
7668 out = PyBytes_AS_STRING(*outbytes) + offset;
7669 }
7670 kind = PyUnicode_KIND(rep);
7671 data = PyUnicode_DATA(rep);
7672 for (i=0; i < outsize; i++) {
7673 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7674 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007675 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007676 encoding, unicode,
7677 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007678 "unable to encode error handler result to ASCII");
7679 Py_DECREF(rep);
7680 goto error;
7681 }
7682 *out = (unsigned char)ch;
7683 out++;
7684 }
7685 }
7686 Py_DECREF(rep);
7687 }
7688 /* write a NUL byte */
7689 *out = 0;
7690 outsize = out - PyBytes_AS_STRING(*outbytes);
7691 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7692 if (_PyBytes_Resize(outbytes, outsize) < 0)
7693 goto error;
7694 ret = 0;
7695
7696error:
7697 Py_XDECREF(encoding_obj);
7698 Py_XDECREF(errorHandler);
7699 Py_XDECREF(exc);
7700 return ret;
7701}
7702
Victor Stinner3a50e702011-10-18 21:21:00 +02007703static PyObject *
7704encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007705 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007706 const char *errors)
7707{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007708 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007709 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007710 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007711 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007712
Victor Stinner29dacf22015-01-26 16:41:32 +01007713 if (!PyUnicode_Check(unicode)) {
7714 PyErr_BadArgument();
7715 return NULL;
7716 }
7717
Benjamin Petersonbac79492012-01-14 13:34:47 -05007718 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007719 return NULL;
7720 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007721
Victor Stinner3a50e702011-10-18 21:21:00 +02007722 if (code_page < 0) {
7723 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7724 return NULL;
7725 }
7726
Martin v. Löwis3d325192011-11-04 18:23:06 +01007727 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007728 return PyBytes_FromStringAndSize(NULL, 0);
7729
Victor Stinner7581cef2011-11-03 22:32:33 +01007730 offset = 0;
7731 do
7732 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007733#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007734 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007735 chunks. */
7736 if (len > INT_MAX/2) {
7737 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007738 done = 0;
7739 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007740 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007741#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007742 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007743 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007744 done = 1;
7745 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007746
Victor Stinner76a31a62011-11-04 00:05:13 +01007747 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007748 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007749 errors);
7750 if (ret == -2)
7751 ret = encode_code_page_errors(code_page, &outbytes,
7752 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007753 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007754 if (ret < 0) {
7755 Py_XDECREF(outbytes);
7756 return NULL;
7757 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007758
Victor Stinner7581cef2011-11-03 22:32:33 +01007759 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007760 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007761 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007762
Victor Stinner3a50e702011-10-18 21:21:00 +02007763 return outbytes;
7764}
7765
7766PyObject *
7767PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7768 Py_ssize_t size,
7769 const char *errors)
7770{
Victor Stinner7581cef2011-11-03 22:32:33 +01007771 PyObject *unicode, *res;
7772 unicode = PyUnicode_FromUnicode(p, size);
7773 if (unicode == NULL)
7774 return NULL;
7775 res = encode_code_page(CP_ACP, unicode, errors);
7776 Py_DECREF(unicode);
7777 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007778}
7779
7780PyObject *
7781PyUnicode_EncodeCodePage(int code_page,
7782 PyObject *unicode,
7783 const char *errors)
7784{
Victor Stinner7581cef2011-11-03 22:32:33 +01007785 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007786}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007787
Alexander Belopolsky40018472011-02-26 01:02:56 +00007788PyObject *
7789PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007790{
Victor Stinner7581cef2011-11-03 22:32:33 +01007791 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007792}
7793
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007794#undef NEED_RETRY
7795
Steve Dowercc16be82016-09-08 10:35:16 -07007796#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007797
Guido van Rossumd57fd912000-03-10 22:53:23 +00007798/* --- Character Mapping Codec -------------------------------------------- */
7799
Victor Stinnerfb161b12013-04-18 01:44:27 +02007800static int
7801charmap_decode_string(const char *s,
7802 Py_ssize_t size,
7803 PyObject *mapping,
7804 const char *errors,
7805 _PyUnicodeWriter *writer)
7806{
7807 const char *starts = s;
7808 const char *e;
7809 Py_ssize_t startinpos, endinpos;
7810 PyObject *errorHandler = NULL, *exc = NULL;
7811 Py_ssize_t maplen;
7812 enum PyUnicode_Kind mapkind;
7813 void *mapdata;
7814 Py_UCS4 x;
7815 unsigned char ch;
7816
7817 if (PyUnicode_READY(mapping) == -1)
7818 return -1;
7819
7820 maplen = PyUnicode_GET_LENGTH(mapping);
7821 mapdata = PyUnicode_DATA(mapping);
7822 mapkind = PyUnicode_KIND(mapping);
7823
7824 e = s + size;
7825
7826 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7827 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7828 * is disabled in encoding aliases, latin1 is preferred because
7829 * its implementation is faster. */
7830 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7831 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7832 Py_UCS4 maxchar = writer->maxchar;
7833
7834 assert (writer->kind == PyUnicode_1BYTE_KIND);
7835 while (s < e) {
7836 ch = *s;
7837 x = mapdata_ucs1[ch];
7838 if (x > maxchar) {
7839 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7840 goto onError;
7841 maxchar = writer->maxchar;
7842 outdata = (Py_UCS1 *)writer->data;
7843 }
7844 outdata[writer->pos] = x;
7845 writer->pos++;
7846 ++s;
7847 }
7848 return 0;
7849 }
7850
7851 while (s < e) {
7852 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7853 enum PyUnicode_Kind outkind = writer->kind;
7854 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7855 if (outkind == PyUnicode_1BYTE_KIND) {
7856 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7857 Py_UCS4 maxchar = writer->maxchar;
7858 while (s < e) {
7859 ch = *s;
7860 x = mapdata_ucs2[ch];
7861 if (x > maxchar)
7862 goto Error;
7863 outdata[writer->pos] = x;
7864 writer->pos++;
7865 ++s;
7866 }
7867 break;
7868 }
7869 else if (outkind == PyUnicode_2BYTE_KIND) {
7870 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7871 while (s < e) {
7872 ch = *s;
7873 x = mapdata_ucs2[ch];
7874 if (x == 0xFFFE)
7875 goto Error;
7876 outdata[writer->pos] = x;
7877 writer->pos++;
7878 ++s;
7879 }
7880 break;
7881 }
7882 }
7883 ch = *s;
7884
7885 if (ch < maplen)
7886 x = PyUnicode_READ(mapkind, mapdata, ch);
7887 else
7888 x = 0xfffe; /* invalid value */
7889Error:
7890 if (x == 0xfffe)
7891 {
7892 /* undefined mapping */
7893 startinpos = s-starts;
7894 endinpos = startinpos+1;
7895 if (unicode_decode_call_errorhandler_writer(
7896 errors, &errorHandler,
7897 "charmap", "character maps to <undefined>",
7898 &starts, &e, &startinpos, &endinpos, &exc, &s,
7899 writer)) {
7900 goto onError;
7901 }
7902 continue;
7903 }
7904
7905 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
7906 goto onError;
7907 ++s;
7908 }
7909 Py_XDECREF(errorHandler);
7910 Py_XDECREF(exc);
7911 return 0;
7912
7913onError:
7914 Py_XDECREF(errorHandler);
7915 Py_XDECREF(exc);
7916 return -1;
7917}
7918
7919static int
7920charmap_decode_mapping(const char *s,
7921 Py_ssize_t size,
7922 PyObject *mapping,
7923 const char *errors,
7924 _PyUnicodeWriter *writer)
7925{
7926 const char *starts = s;
7927 const char *e;
7928 Py_ssize_t startinpos, endinpos;
7929 PyObject *errorHandler = NULL, *exc = NULL;
7930 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02007931 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02007932
7933 e = s + size;
7934
7935 while (s < e) {
7936 ch = *s;
7937
7938 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7939 key = PyLong_FromLong((long)ch);
7940 if (key == NULL)
7941 goto onError;
7942
7943 item = PyObject_GetItem(mapping, key);
7944 Py_DECREF(key);
7945 if (item == NULL) {
7946 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7947 /* No mapping found means: mapping is undefined. */
7948 PyErr_Clear();
7949 goto Undefined;
7950 } else
7951 goto onError;
7952 }
7953
7954 /* Apply mapping */
7955 if (item == Py_None)
7956 goto Undefined;
7957 if (PyLong_Check(item)) {
7958 long value = PyLong_AS_LONG(item);
7959 if (value == 0xFFFE)
7960 goto Undefined;
7961 if (value < 0 || value > MAX_UNICODE) {
7962 PyErr_Format(PyExc_TypeError,
7963 "character mapping must be in range(0x%lx)",
7964 (unsigned long)MAX_UNICODE + 1);
7965 goto onError;
7966 }
7967
7968 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7969 goto onError;
7970 }
7971 else if (PyUnicode_Check(item)) {
7972 if (PyUnicode_READY(item) == -1)
7973 goto onError;
7974 if (PyUnicode_GET_LENGTH(item) == 1) {
7975 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
7976 if (value == 0xFFFE)
7977 goto Undefined;
7978 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7979 goto onError;
7980 }
7981 else {
7982 writer->overallocate = 1;
7983 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
7984 goto onError;
7985 }
7986 }
7987 else {
7988 /* wrong return value */
7989 PyErr_SetString(PyExc_TypeError,
7990 "character mapping must return integer, None or str");
7991 goto onError;
7992 }
7993 Py_CLEAR(item);
7994 ++s;
7995 continue;
7996
7997Undefined:
7998 /* undefined mapping */
7999 Py_CLEAR(item);
8000 startinpos = s-starts;
8001 endinpos = startinpos+1;
8002 if (unicode_decode_call_errorhandler_writer(
8003 errors, &errorHandler,
8004 "charmap", "character maps to <undefined>",
8005 &starts, &e, &startinpos, &endinpos, &exc, &s,
8006 writer)) {
8007 goto onError;
8008 }
8009 }
8010 Py_XDECREF(errorHandler);
8011 Py_XDECREF(exc);
8012 return 0;
8013
8014onError:
8015 Py_XDECREF(item);
8016 Py_XDECREF(errorHandler);
8017 Py_XDECREF(exc);
8018 return -1;
8019}
8020
Alexander Belopolsky40018472011-02-26 01:02:56 +00008021PyObject *
8022PyUnicode_DecodeCharmap(const char *s,
8023 Py_ssize_t size,
8024 PyObject *mapping,
8025 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008026{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008027 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00008028
Guido van Rossumd57fd912000-03-10 22:53:23 +00008029 /* Default to Latin-1 */
8030 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008031 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008032
Guido van Rossumd57fd912000-03-10 22:53:23 +00008033 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02008034 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02008035 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02008036 writer.min_length = size;
8037 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008038 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008039
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008040 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008041 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8042 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008043 }
8044 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008045 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8046 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008047 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008048 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00008049
Benjamin Peterson29060642009-01-31 22:14:21 +00008050 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008051 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008052 return NULL;
8053}
8054
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008055/* Charmap encoding: the lookup table */
8056
Alexander Belopolsky40018472011-02-26 01:02:56 +00008057struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00008058 PyObject_HEAD
8059 unsigned char level1[32];
8060 int count2, count3;
8061 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008062};
8063
8064static PyObject*
8065encoding_map_size(PyObject *obj, PyObject* args)
8066{
8067 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008068 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00008069 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008070}
8071
8072static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008073 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00008074 PyDoc_STR("Return the size (in bytes) of this object") },
8075 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008076};
8077
8078static void
8079encoding_map_dealloc(PyObject* o)
8080{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008081 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008082}
8083
8084static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008085 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008086 "EncodingMap", /*tp_name*/
8087 sizeof(struct encoding_map), /*tp_basicsize*/
8088 0, /*tp_itemsize*/
8089 /* methods */
8090 encoding_map_dealloc, /*tp_dealloc*/
8091 0, /*tp_print*/
8092 0, /*tp_getattr*/
8093 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00008094 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008095 0, /*tp_repr*/
8096 0, /*tp_as_number*/
8097 0, /*tp_as_sequence*/
8098 0, /*tp_as_mapping*/
8099 0, /*tp_hash*/
8100 0, /*tp_call*/
8101 0, /*tp_str*/
8102 0, /*tp_getattro*/
8103 0, /*tp_setattro*/
8104 0, /*tp_as_buffer*/
8105 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8106 0, /*tp_doc*/
8107 0, /*tp_traverse*/
8108 0, /*tp_clear*/
8109 0, /*tp_richcompare*/
8110 0, /*tp_weaklistoffset*/
8111 0, /*tp_iter*/
8112 0, /*tp_iternext*/
8113 encoding_map_methods, /*tp_methods*/
8114 0, /*tp_members*/
8115 0, /*tp_getset*/
8116 0, /*tp_base*/
8117 0, /*tp_dict*/
8118 0, /*tp_descr_get*/
8119 0, /*tp_descr_set*/
8120 0, /*tp_dictoffset*/
8121 0, /*tp_init*/
8122 0, /*tp_alloc*/
8123 0, /*tp_new*/
8124 0, /*tp_free*/
8125 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008126};
8127
8128PyObject*
8129PyUnicode_BuildEncodingMap(PyObject* string)
8130{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008131 PyObject *result;
8132 struct encoding_map *mresult;
8133 int i;
8134 int need_dict = 0;
8135 unsigned char level1[32];
8136 unsigned char level2[512];
8137 unsigned char *mlevel1, *mlevel2, *mlevel3;
8138 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008139 int kind;
8140 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008141 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008142 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008143
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008144 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008145 PyErr_BadArgument();
8146 return NULL;
8147 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008148 kind = PyUnicode_KIND(string);
8149 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008150 length = PyUnicode_GET_LENGTH(string);
8151 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008152 memset(level1, 0xFF, sizeof level1);
8153 memset(level2, 0xFF, sizeof level2);
8154
8155 /* If there isn't a one-to-one mapping of NULL to \0,
8156 or if there are non-BMP characters, we need to use
8157 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008158 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008159 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008160 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008161 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008162 ch = PyUnicode_READ(kind, data, i);
8163 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008164 need_dict = 1;
8165 break;
8166 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008167 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008168 /* unmapped character */
8169 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008170 l1 = ch >> 11;
8171 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008172 if (level1[l1] == 0xFF)
8173 level1[l1] = count2++;
8174 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008175 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008176 }
8177
8178 if (count2 >= 0xFF || count3 >= 0xFF)
8179 need_dict = 1;
8180
8181 if (need_dict) {
8182 PyObject *result = PyDict_New();
8183 PyObject *key, *value;
8184 if (!result)
8185 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008186 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008187 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008188 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008189 if (!key || !value)
8190 goto failed1;
8191 if (PyDict_SetItem(result, key, value) == -1)
8192 goto failed1;
8193 Py_DECREF(key);
8194 Py_DECREF(value);
8195 }
8196 return result;
8197 failed1:
8198 Py_XDECREF(key);
8199 Py_XDECREF(value);
8200 Py_DECREF(result);
8201 return NULL;
8202 }
8203
8204 /* Create a three-level trie */
8205 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8206 16*count2 + 128*count3 - 1);
8207 if (!result)
8208 return PyErr_NoMemory();
8209 PyObject_Init(result, &EncodingMapType);
8210 mresult = (struct encoding_map*)result;
8211 mresult->count2 = count2;
8212 mresult->count3 = count3;
8213 mlevel1 = mresult->level1;
8214 mlevel2 = mresult->level23;
8215 mlevel3 = mresult->level23 + 16*count2;
8216 memcpy(mlevel1, level1, 32);
8217 memset(mlevel2, 0xFF, 16*count2);
8218 memset(mlevel3, 0, 128*count3);
8219 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008220 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008221 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008222 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8223 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008224 /* unmapped character */
8225 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008226 o1 = ch>>11;
8227 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008228 i2 = 16*mlevel1[o1] + o2;
8229 if (mlevel2[i2] == 0xFF)
8230 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008231 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008232 i3 = 128*mlevel2[i2] + o3;
8233 mlevel3[i3] = i;
8234 }
8235 return result;
8236}
8237
8238static int
Victor Stinner22168992011-11-20 17:09:18 +01008239encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008240{
8241 struct encoding_map *map = (struct encoding_map*)mapping;
8242 int l1 = c>>11;
8243 int l2 = (c>>7) & 0xF;
8244 int l3 = c & 0x7F;
8245 int i;
8246
Victor Stinner22168992011-11-20 17:09:18 +01008247 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008248 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008249 if (c == 0)
8250 return 0;
8251 /* level 1*/
8252 i = map->level1[l1];
8253 if (i == 0xFF) {
8254 return -1;
8255 }
8256 /* level 2*/
8257 i = map->level23[16*i+l2];
8258 if (i == 0xFF) {
8259 return -1;
8260 }
8261 /* level 3 */
8262 i = map->level23[16*map->count2 + 128*i + l3];
8263 if (i == 0) {
8264 return -1;
8265 }
8266 return i;
8267}
8268
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008269/* Lookup the character ch in the mapping. If the character
8270 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008271 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008272static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008273charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008274{
Christian Heimes217cfd12007-12-02 14:31:20 +00008275 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008276 PyObject *x;
8277
8278 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008279 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008280 x = PyObject_GetItem(mapping, w);
8281 Py_DECREF(w);
8282 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008283 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8284 /* No mapping found means: mapping is undefined. */
8285 PyErr_Clear();
8286 x = Py_None;
8287 Py_INCREF(x);
8288 return x;
8289 } else
8290 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008291 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008292 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008293 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008294 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008295 long value = PyLong_AS_LONG(x);
8296 if (value < 0 || value > 255) {
8297 PyErr_SetString(PyExc_TypeError,
8298 "character mapping must be in range(256)");
8299 Py_DECREF(x);
8300 return NULL;
8301 }
8302 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008303 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008304 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008305 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008306 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008307 /* wrong return value */
8308 PyErr_Format(PyExc_TypeError,
8309 "character mapping must return integer, bytes or None, not %.400s",
8310 x->ob_type->tp_name);
8311 Py_DECREF(x);
8312 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008313 }
8314}
8315
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008316static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008317charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008318{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008319 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8320 /* exponentially overallocate to minimize reallocations */
8321 if (requiredsize < 2*outsize)
8322 requiredsize = 2*outsize;
8323 if (_PyBytes_Resize(outobj, requiredsize))
8324 return -1;
8325 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008326}
8327
/* Outcome of encoding a single character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* a lookup or resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008331/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008332 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008333 space is available. Return a new reference to the object that
8334 was put in the output buffer, or Py_None, if the mapping was undefined
8335 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008336 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008337static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008338charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008339 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008340{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008341 PyObject *rep;
8342 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008343 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008344
Christian Heimes90aa7642007-12-19 02:45:37 +00008345 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008346 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008347 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008348 if (res == -1)
8349 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008350 if (outsize<requiredsize)
8351 if (charmapencode_resize(outobj, outpos, requiredsize))
8352 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008353 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008354 outstart[(*outpos)++] = (char)res;
8355 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008356 }
8357
8358 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008359 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008360 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008361 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008362 Py_DECREF(rep);
8363 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008364 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008365 if (PyLong_Check(rep)) {
8366 Py_ssize_t requiredsize = *outpos+1;
8367 if (outsize<requiredsize)
8368 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8369 Py_DECREF(rep);
8370 return enc_EXCEPTION;
8371 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008372 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008373 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008374 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008375 else {
8376 const char *repchars = PyBytes_AS_STRING(rep);
8377 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8378 Py_ssize_t requiredsize = *outpos+repsize;
8379 if (outsize<requiredsize)
8380 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8381 Py_DECREF(rep);
8382 return enc_EXCEPTION;
8383 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008384 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008385 memcpy(outstart + *outpos, repchars, repsize);
8386 *outpos += repsize;
8387 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008388 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008389 Py_DECREF(rep);
8390 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008391}
8392
8393/* handle an error in PyUnicode_EncodeCharmap
8394 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008395static int
8396charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008397 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008398 PyObject **exceptionObject,
Victor Stinner50149202015-09-22 00:26:54 +02008399 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008400 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008401{
8402 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008403 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008404 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008405 enum PyUnicode_Kind kind;
8406 void *data;
8407 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008408 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008409 Py_ssize_t collstartpos = *inpos;
8410 Py_ssize_t collendpos = *inpos+1;
8411 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008412 char *encoding = "charmap";
8413 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008414 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008415 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008416 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008417
Benjamin Petersonbac79492012-01-14 13:34:47 -05008418 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008419 return -1;
8420 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008421 /* find all unencodable characters */
8422 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008423 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008424 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008425 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008426 val = encoding_map_lookup(ch, mapping);
8427 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008428 break;
8429 ++collendpos;
8430 continue;
8431 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008432
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008433 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8434 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008435 if (rep==NULL)
8436 return -1;
8437 else if (rep!=Py_None) {
8438 Py_DECREF(rep);
8439 break;
8440 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008441 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008442 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008443 }
8444 /* cache callback name lookup
8445 * (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02008446 if (*error_handler == _Py_ERROR_UNKNOWN)
8447 *error_handler = get_error_handler(errors);
8448
8449 switch (*error_handler) {
8450 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008451 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008452 return -1;
Victor Stinner50149202015-09-22 00:26:54 +02008453
8454 case _Py_ERROR_REPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008455 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008456 x = charmapencode_output('?', mapping, res, respos);
8457 if (x==enc_EXCEPTION) {
8458 return -1;
8459 }
8460 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008461 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008462 return -1;
8463 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008464 }
8465 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02008466 case _Py_ERROR_IGNORE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008467 *inpos = collendpos;
8468 break;
Victor Stinner50149202015-09-22 00:26:54 +02008469
8470 case _Py_ERROR_XMLCHARREFREPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008471 /* generate replacement (temporarily (mis)uses p) */
8472 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008473 char buffer[2+29+1+1];
8474 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008475 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008476 for (cp = buffer; *cp; ++cp) {
8477 x = charmapencode_output(*cp, mapping, res, respos);
8478 if (x==enc_EXCEPTION)
8479 return -1;
8480 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008481 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008482 return -1;
8483 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008484 }
8485 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008486 *inpos = collendpos;
8487 break;
Victor Stinner50149202015-09-22 00:26:54 +02008488
Benjamin Peterson14339b62009-01-31 16:36:08 +00008489 default:
Victor Stinner50149202015-09-22 00:26:54 +02008490 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008491 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008492 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008493 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008494 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008495 if (PyBytes_Check(repunicode)) {
8496 /* Directly copy bytes result to output. */
8497 Py_ssize_t outsize = PyBytes_Size(*res);
8498 Py_ssize_t requiredsize;
8499 repsize = PyBytes_Size(repunicode);
8500 requiredsize = *respos + repsize;
8501 if (requiredsize > outsize)
8502 /* Make room for all additional bytes. */
8503 if (charmapencode_resize(res, respos, requiredsize)) {
8504 Py_DECREF(repunicode);
8505 return -1;
8506 }
8507 memcpy(PyBytes_AsString(*res) + *respos,
8508 PyBytes_AsString(repunicode), repsize);
8509 *respos += repsize;
8510 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008511 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008512 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008513 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008514 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008515 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008516 Py_DECREF(repunicode);
8517 return -1;
8518 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008519 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008520 data = PyUnicode_DATA(repunicode);
8521 kind = PyUnicode_KIND(repunicode);
8522 for (index = 0; index < repsize; index++) {
8523 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8524 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008525 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008526 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008527 return -1;
8528 }
8529 else if (x==enc_FAILED) {
8530 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008531 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008532 return -1;
8533 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008534 }
8535 *inpos = newpos;
8536 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008537 }
8538 return 0;
8539}
8540
Alexander Belopolsky40018472011-02-26 01:02:56 +00008541PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008542_PyUnicode_EncodeCharmap(PyObject *unicode,
8543 PyObject *mapping,
8544 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008545{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008546 /* output object */
8547 PyObject *res = NULL;
8548 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008549 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008550 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008551 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008552 Py_ssize_t respos = 0;
Victor Stinner50149202015-09-22 00:26:54 +02008553 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008554 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02008555 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008556 void *data;
8557 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008558
Benjamin Petersonbac79492012-01-14 13:34:47 -05008559 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008560 return NULL;
8561 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008562 data = PyUnicode_DATA(unicode);
8563 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008564
Guido van Rossumd57fd912000-03-10 22:53:23 +00008565 /* Default to Latin-1 */
8566 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008567 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008568
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008569 /* allocate enough for a simple encoding without
8570 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008571 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008572 if (res == NULL)
8573 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008574 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008575 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008576
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008577 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008578 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008579 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008580 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008581 if (x==enc_EXCEPTION) /* error */
8582 goto onError;
8583 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008584 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008585 &exc,
Victor Stinner50149202015-09-22 00:26:54 +02008586 &error_handler, &error_handler_obj, errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00008587 &res, &respos)) {
8588 goto onError;
8589 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008590 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008591 else
8592 /* done with this character => adjust input position */
8593 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008594 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008595
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008596 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008597 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008598 if (_PyBytes_Resize(&res, respos) < 0)
8599 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008600
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008601 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008602 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008603 return res;
8604
Benjamin Peterson29060642009-01-31 22:14:21 +00008605 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008606 Py_XDECREF(res);
8607 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008608 Py_XDECREF(error_handler_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008609 return NULL;
8610}
8611
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008612/* Deprecated */
8613PyObject *
8614PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8615 Py_ssize_t size,
8616 PyObject *mapping,
8617 const char *errors)
8618{
8619 PyObject *result;
8620 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8621 if (unicode == NULL)
8622 return NULL;
8623 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8624 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008625 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008626}
8627
Alexander Belopolsky40018472011-02-26 01:02:56 +00008628PyObject *
8629PyUnicode_AsCharmapString(PyObject *unicode,
8630 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008631{
8632 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008633 PyErr_BadArgument();
8634 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008635 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008636 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008637}
8638
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008639/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008640static void
8641make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008642 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008643 Py_ssize_t startpos, Py_ssize_t endpos,
8644 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008645{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008646 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008647 *exceptionObject = _PyUnicodeTranslateError_Create(
8648 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008649 }
8650 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008651 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8652 goto onError;
8653 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8654 goto onError;
8655 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8656 goto onError;
8657 return;
8658 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008659 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008660 }
8661}
8662
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008663/* error handling callback helper:
8664 build arguments, call the callback and check the arguments,
8665 put the result into newpos and return the replacement string, which
8666 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008667static PyObject *
8668unicode_translate_call_errorhandler(const char *errors,
8669 PyObject **errorHandler,
8670 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008671 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008672 Py_ssize_t startpos, Py_ssize_t endpos,
8673 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008674{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02008675 static const char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008676
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008677 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008678 PyObject *restuple;
8679 PyObject *resunicode;
8680
8681 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008682 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008683 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008684 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008685 }
8686
8687 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008688 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008689 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008690 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008691
8692 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008693 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008694 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008695 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008696 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008697 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008698 Py_DECREF(restuple);
8699 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008700 }
8701 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008702 &resunicode, &i_newpos)) {
8703 Py_DECREF(restuple);
8704 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008705 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008706 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008707 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008708 else
8709 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008710 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02008711 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008712 Py_DECREF(restuple);
8713 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008714 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008715 Py_INCREF(resunicode);
8716 Py_DECREF(restuple);
8717 return resunicode;
8718}
8719
8720/* Lookup the character ch in the mapping and put the result in result,
8721 which must be decrefed by the caller.
8722 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008723static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008724charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008725{
Christian Heimes217cfd12007-12-02 14:31:20 +00008726 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008727 PyObject *x;
8728
8729 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008730 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008731 x = PyObject_GetItem(mapping, w);
8732 Py_DECREF(w);
8733 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008734 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8735 /* No mapping found means: use 1:1 mapping. */
8736 PyErr_Clear();
8737 *result = NULL;
8738 return 0;
8739 } else
8740 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008741 }
8742 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008743 *result = x;
8744 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008745 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008746 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008747 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008748 if (value < 0 || value > MAX_UNICODE) {
8749 PyErr_Format(PyExc_ValueError,
8750 "character mapping must be in range(0x%x)",
8751 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008752 Py_DECREF(x);
8753 return -1;
8754 }
8755 *result = x;
8756 return 0;
8757 }
8758 else if (PyUnicode_Check(x)) {
8759 *result = x;
8760 return 0;
8761 }
8762 else {
8763 /* wrong return value */
8764 PyErr_SetString(PyExc_TypeError,
8765 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008766 Py_DECREF(x);
8767 return -1;
8768 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008769}
Victor Stinner1194ea02014-04-04 19:37:40 +02008770
8771/* lookup the character, write the result into the writer.
8772 Return 1 if the result was written into the writer, return 0 if the mapping
8773 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008774static int
Victor Stinner1194ea02014-04-04 19:37:40 +02008775charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8776 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008777{
Victor Stinner1194ea02014-04-04 19:37:40 +02008778 PyObject *item;
8779
8780 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00008781 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008782
8783 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008784 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02008785 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008786 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008787 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008788 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008789 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008790
8791 if (item == Py_None) {
8792 Py_DECREF(item);
8793 return 0;
8794 }
8795
8796 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02008797 long ch = (Py_UCS4)PyLong_AS_LONG(item);
8798 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8799 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008800 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8801 Py_DECREF(item);
8802 return -1;
8803 }
8804 Py_DECREF(item);
8805 return 1;
8806 }
8807
8808 if (!PyUnicode_Check(item)) {
8809 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008810 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008811 }
8812
8813 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8814 Py_DECREF(item);
8815 return -1;
8816 }
8817
8818 Py_DECREF(item);
8819 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008820}
8821
Victor Stinner89a76ab2014-04-05 11:44:04 +02008822static int
8823unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8824 Py_UCS1 *translate)
8825{
Benjamin Peterson1365de72014-04-07 20:15:41 -04008826 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008827 int ret = 0;
8828
Victor Stinner89a76ab2014-04-05 11:44:04 +02008829 if (charmaptranslate_lookup(ch, mapping, &item)) {
8830 return -1;
8831 }
8832
8833 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008834 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02008835 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008836 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008837 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008838 /* not found => default to 1:1 mapping */
8839 translate[ch] = ch;
8840 return 1;
8841 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008842 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02008843 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008844 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8845 used it */
8846 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008847 /* invalid character or character outside ASCII:
8848 skip the fast translate */
8849 goto exit;
8850 }
8851 translate[ch] = (Py_UCS1)replace;
8852 }
8853 else if (PyUnicode_Check(item)) {
8854 Py_UCS4 replace;
8855
8856 if (PyUnicode_READY(item) == -1) {
8857 Py_DECREF(item);
8858 return -1;
8859 }
8860 if (PyUnicode_GET_LENGTH(item) != 1)
8861 goto exit;
8862
8863 replace = PyUnicode_READ_CHAR(item, 0);
8864 if (replace > 127)
8865 goto exit;
8866 translate[ch] = (Py_UCS1)replace;
8867 }
8868 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008869 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02008870 goto exit;
8871 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008872 ret = 1;
8873
Benjamin Peterson1365de72014-04-07 20:15:41 -04008874 exit:
8875 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008876 return ret;
8877}
8878
8879/* Fast path for ascii => ascii translation. Return 1 if the whole string
8880 was translated into writer, return 0 if the input string was partially
8881 translated into writer, raise an exception and return -1 on error. */
8882static int
8883unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008884 _PyUnicodeWriter *writer, int ignore,
8885 Py_ssize_t *input_pos)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008886{
Victor Stinner872b2912014-04-05 14:27:07 +02008887 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008888 Py_ssize_t len;
8889 Py_UCS1 *in, *end, *out;
Victor Stinner872b2912014-04-05 14:27:07 +02008890 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008891
Victor Stinner89a76ab2014-04-05 11:44:04 +02008892 len = PyUnicode_GET_LENGTH(input);
8893
Victor Stinner872b2912014-04-05 14:27:07 +02008894 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008895
8896 in = PyUnicode_1BYTE_DATA(input);
8897 end = in + len;
8898
8899 assert(PyUnicode_IS_ASCII(writer->buffer));
8900 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
8901 out = PyUnicode_1BYTE_DATA(writer->buffer);
8902
Victor Stinner872b2912014-04-05 14:27:07 +02008903 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008904 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02008905 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008906 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02008907 int translate = unicode_fast_translate_lookup(mapping, ch,
8908 ascii_table);
8909 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008910 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02008911 if (translate == 0)
8912 goto exit;
8913 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008914 }
Victor Stinner872b2912014-04-05 14:27:07 +02008915 if (ch2 == 0xfe) {
8916 if (ignore)
8917 continue;
8918 goto exit;
8919 }
8920 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008921 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02008922 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008923 }
Victor Stinner872b2912014-04-05 14:27:07 +02008924 res = 1;
8925
8926exit:
8927 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008928 *input_pos = in - PyUnicode_1BYTE_DATA(input);
Victor Stinner872b2912014-04-05 14:27:07 +02008929 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008930}
8931
Victor Stinner3222da22015-10-01 22:07:32 +02008932static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008933_PyUnicode_TranslateCharmap(PyObject *input,
8934 PyObject *mapping,
8935 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008936{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008937 /* input object */
Victor Stinner1194ea02014-04-04 19:37:40 +02008938 char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008939 Py_ssize_t size, i;
8940 int kind;
8941 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02008942 _PyUnicodeWriter writer;
8943 /* error handler */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008944 char *reason = "character maps to <undefined>";
8945 PyObject *errorHandler = NULL;
8946 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008947 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008948 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008949
Guido van Rossumd57fd912000-03-10 22:53:23 +00008950 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008951 PyErr_BadArgument();
8952 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008953 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008954
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008955 if (PyUnicode_READY(input) == -1)
8956 return NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008957 data = (char*)PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008958 kind = PyUnicode_KIND(input);
8959 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008960
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03008961 if (size == 0)
8962 return PyUnicode_FromObject(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008963
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008964 /* allocate enough for a simple 1:1 translation without
8965 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02008966 _PyUnicodeWriter_Init(&writer);
8967 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008968 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008969
Victor Stinner872b2912014-04-05 14:27:07 +02008970 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
8971
Victor Stinner33798672016-03-01 21:59:58 +01008972 if (PyUnicode_READY(input) == -1)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008973 return NULL;
Victor Stinner33798672016-03-01 21:59:58 +01008974 if (PyUnicode_IS_ASCII(input)) {
8975 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
8976 if (res < 0) {
8977 _PyUnicodeWriter_Dealloc(&writer);
8978 return NULL;
8979 }
8980 if (res == 1)
8981 return _PyUnicodeWriter_Finish(&writer);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008982 }
Victor Stinner33798672016-03-01 21:59:58 +01008983 else {
8984 i = 0;
8985 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008986
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008987 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008988 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008989 int translate;
8990 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8991 Py_ssize_t newpos;
8992 /* startpos for collecting untranslatable chars */
8993 Py_ssize_t collstart;
8994 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02008995 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008996
Victor Stinner1194ea02014-04-04 19:37:40 +02008997 ch = PyUnicode_READ(kind, data, i);
8998 translate = charmaptranslate_output(ch, mapping, &writer);
8999 if (translate < 0)
9000 goto onError;
9001
9002 if (translate != 0) {
9003 /* it worked => adjust input pointer */
9004 ++i;
9005 continue;
9006 }
9007
9008 /* untranslatable character */
9009 collstart = i;
9010 collend = i+1;
9011
9012 /* find all untranslatable characters */
9013 while (collend < size) {
9014 PyObject *x;
9015 ch = PyUnicode_READ(kind, data, collend);
9016 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009017 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02009018 Py_XDECREF(x);
9019 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00009020 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02009021 ++collend;
9022 }
9023
9024 if (ignore) {
9025 i = collend;
9026 }
9027 else {
9028 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9029 reason, input, &exc,
9030 collstart, collend, &newpos);
9031 if (repunicode == NULL)
9032 goto onError;
9033 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009034 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02009035 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009036 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009037 Py_DECREF(repunicode);
9038 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009039 }
9040 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009041 Py_XDECREF(exc);
9042 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02009043 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009044
Benjamin Peterson29060642009-01-31 22:14:21 +00009045 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02009046 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009047 Py_XDECREF(exc);
9048 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009049 return NULL;
9050}
9051
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009052/* Deprecated. Use PyUnicode_Translate instead. */
9053PyObject *
9054PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9055 Py_ssize_t size,
9056 PyObject *mapping,
9057 const char *errors)
9058{
Christian Heimes5f520f42012-09-11 14:03:25 +02009059 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009060 PyObject *unicode = PyUnicode_FromUnicode(p, size);
9061 if (!unicode)
9062 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02009063 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9064 Py_DECREF(unicode);
9065 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009066}
9067
Alexander Belopolsky40018472011-02-26 01:02:56 +00009068PyObject *
9069PyUnicode_Translate(PyObject *str,
9070 PyObject *mapping,
9071 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009072{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009073 if (ensure_unicode(str) < 0)
Christian Heimes5f520f42012-09-11 14:03:25 +02009074 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009075 return _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009076}
Tim Petersced69f82003-09-16 20:30:58 +00009077
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009078static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009079fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009080{
9081 /* No need to call PyUnicode_READY(self) because this function is only
9082 called as a callback from fixup() which does it already. */
9083 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9084 const int kind = PyUnicode_KIND(self);
9085 void *data = PyUnicode_DATA(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02009086 Py_UCS4 maxchar = 127, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009087 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009088 Py_ssize_t i;
9089
9090 for (i = 0; i < len; ++i) {
9091 ch = PyUnicode_READ(kind, data, i);
9092 fixed = 0;
9093 if (ch > 127) {
9094 if (Py_UNICODE_ISSPACE(ch))
9095 fixed = ' ';
9096 else {
9097 const int decimal = Py_UNICODE_TODECIMAL(ch);
9098 if (decimal >= 0)
9099 fixed = '0' + decimal;
9100 }
9101 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009102 modified = 1;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009103 maxchar = Py_MAX(maxchar, fixed);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009104 PyUnicode_WRITE(kind, data, i, fixed);
9105 }
Victor Stinnere6abb482012-05-02 01:15:40 +02009106 else
Benjamin Peterson7e303732013-06-10 09:19:46 -07009107 maxchar = Py_MAX(maxchar, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009108 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009109 }
9110
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009111 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009112}
9113
9114PyObject *
9115_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9116{
9117 if (!PyUnicode_Check(unicode)) {
9118 PyErr_BadInternalCall();
9119 return NULL;
9120 }
9121 if (PyUnicode_READY(unicode) == -1)
9122 return NULL;
9123 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
9124 /* If the string is already ASCII, just return the same string */
9125 Py_INCREF(unicode);
9126 return unicode;
9127 }
Victor Stinner9310abb2011-10-05 00:59:23 +02009128 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009129}
9130
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009131PyObject *
9132PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9133 Py_ssize_t length)
9134{
Victor Stinnerf0124502011-11-21 23:12:56 +01009135 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009136 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009137 Py_UCS4 maxchar;
9138 enum PyUnicode_Kind kind;
9139 void *data;
9140
Victor Stinner99d7ad02012-02-22 13:37:39 +01009141 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009142 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009143 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009144 if (ch > 127) {
9145 int decimal = Py_UNICODE_TODECIMAL(ch);
9146 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009147 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009148 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009149 }
9150 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009151
9152 /* Copy to a new string */
9153 decimal = PyUnicode_New(length, maxchar);
9154 if (decimal == NULL)
9155 return decimal;
9156 kind = PyUnicode_KIND(decimal);
9157 data = PyUnicode_DATA(decimal);
9158 /* Iterate over code points */
9159 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009160 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01009161 if (ch > 127) {
9162 int decimal = Py_UNICODE_TODECIMAL(ch);
9163 if (decimal >= 0)
9164 ch = '0' + decimal;
9165 }
9166 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009167 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009168 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009169}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009170/* --- Decimal Encoder ---------------------------------------------------- */
9171
Alexander Belopolsky40018472011-02-26 01:02:56 +00009172int
9173PyUnicode_EncodeDecimal(Py_UNICODE *s,
9174 Py_ssize_t length,
9175 char *output,
9176 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009177{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009178 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009179 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009180 enum PyUnicode_Kind kind;
9181 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009182
9183 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009184 PyErr_BadArgument();
9185 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009186 }
9187
Victor Stinner42bf7752011-11-21 22:52:58 +01009188 unicode = PyUnicode_FromUnicode(s, length);
9189 if (unicode == NULL)
9190 return -1;
9191
Benjamin Petersonbac79492012-01-14 13:34:47 -05009192 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner6345be92011-11-25 20:09:01 +01009193 Py_DECREF(unicode);
9194 return -1;
9195 }
Victor Stinner42bf7752011-11-21 22:52:58 +01009196 kind = PyUnicode_KIND(unicode);
9197 data = PyUnicode_DATA(unicode);
9198
Victor Stinnerb84d7232011-11-22 01:50:07 +01009199 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009200 PyObject *exc;
9201 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009202 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009203 Py_ssize_t startpos;
9204
9205 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009206
Benjamin Peterson29060642009-01-31 22:14:21 +00009207 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009208 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009209 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009210 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009211 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009212 decimal = Py_UNICODE_TODECIMAL(ch);
9213 if (decimal >= 0) {
9214 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009215 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009216 continue;
9217 }
9218 if (0 < ch && ch < 256) {
9219 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009220 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009221 continue;
9222 }
Victor Stinner6345be92011-11-25 20:09:01 +01009223
Victor Stinner42bf7752011-11-21 22:52:58 +01009224 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009225 exc = NULL;
9226 raise_encode_exception(&exc, "decimal", unicode,
9227 startpos, startpos+1,
9228 "invalid decimal Unicode string");
9229 Py_XDECREF(exc);
9230 Py_DECREF(unicode);
9231 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009232 }
9233 /* 0-terminate the output string */
9234 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009235 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009236 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009237}
9238
Guido van Rossumd57fd912000-03-10 22:53:23 +00009239/* --- Helpers ------------------------------------------------------------ */
9240
/* Helper macro to fix up start/end slice values against a length:

     - `end` larger than `len` is clipped to `len`;
     - a negative `end` or `start` has `len` added to it and is then
       floored at 0;
     - a `start` larger than `len` is left untouched.

   `start` and `end` must be modifiable lvalues.  All three arguments are
   evaluated more than once, so they must be free of side effects.

   The body is wrapped in do { ... } while (0) so the macro behaves as a
   single statement (safe inside an unbraced if/else); invoke it with a
   trailing semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
9255
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009256static Py_ssize_t
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009257any_find_slice(PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009258 Py_ssize_t start,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009259 Py_ssize_t end,
9260 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009261{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009262 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009263 void *buf1, *buf2;
9264 Py_ssize_t len1, len2, result;
9265
9266 kind1 = PyUnicode_KIND(s1);
9267 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009268 if (kind1 < kind2)
9269 return -1;
9270
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009271 len1 = PyUnicode_GET_LENGTH(s1);
9272 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009273 ADJUST_INDICES(start, end, len1);
9274 if (end - start < len2)
9275 return -1;
9276
9277 buf1 = PyUnicode_DATA(s1);
9278 buf2 = PyUnicode_DATA(s2);
9279 if (len2 == 1) {
9280 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9281 result = findchar((const char *)buf1 + kind1*start,
9282 kind1, end - start, ch, direction);
9283 if (result == -1)
9284 return -1;
9285 else
9286 return start + result;
9287 }
9288
9289 if (kind2 != kind1) {
9290 buf2 = _PyUnicode_AsKind(s2, kind1);
9291 if (!buf2)
9292 return -2;
9293 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009294
Victor Stinner794d5672011-10-10 03:21:36 +02009295 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009296 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009297 case PyUnicode_1BYTE_KIND:
9298 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9299 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9300 else
9301 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9302 break;
9303 case PyUnicode_2BYTE_KIND:
9304 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9305 break;
9306 case PyUnicode_4BYTE_KIND:
9307 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9308 break;
9309 default:
9310 assert(0); result = -2;
9311 }
9312 }
9313 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009314 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009315 case PyUnicode_1BYTE_KIND:
9316 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9317 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9318 else
9319 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9320 break;
9321 case PyUnicode_2BYTE_KIND:
9322 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9323 break;
9324 case PyUnicode_4BYTE_KIND:
9325 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9326 break;
9327 default:
9328 assert(0); result = -2;
9329 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009330 }
9331
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009332 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009333 PyMem_Free(buf2);
9334
9335 return result;
9336}
9337
9338Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009339_PyUnicode_InsertThousandsGrouping(
9340 PyObject *unicode, Py_ssize_t index,
9341 Py_ssize_t n_buffer,
9342 void *digits, Py_ssize_t n_digits,
9343 Py_ssize_t min_width,
9344 const char *grouping, PyObject *thousands_sep,
9345 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009346{
Victor Stinner41a863c2012-02-24 00:37:51 +01009347 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009348 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01009349 Py_ssize_t thousands_sep_len;
9350 Py_ssize_t len;
9351
9352 if (unicode != NULL) {
9353 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009354 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01009355 }
9356 else {
9357 kind = PyUnicode_1BYTE_KIND;
9358 data = NULL;
9359 }
9360 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
9361 thousands_sep_data = PyUnicode_DATA(thousands_sep);
9362 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9363 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01009364 if (thousands_sep_kind < kind) {
9365 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
9366 if (!thousands_sep_data)
9367 return -1;
9368 }
9369 else {
9370 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
9371 if (!data)
9372 return -1;
9373 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009374 }
9375
Benjamin Petersonead6b532011-12-20 17:23:42 -06009376 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009377 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009378 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01009379 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009380 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009381 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009382 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009383 else
Victor Stinner41a863c2012-02-24 00:37:51 +01009384 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02009385 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009386 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009387 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009388 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009389 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009390 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009391 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009392 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009393 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009394 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009395 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009396 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009397 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009398 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009399 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009400 break;
9401 default:
9402 assert(0);
9403 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009404 }
Victor Stinner90f50d42012-02-24 01:44:47 +01009405 if (unicode != NULL && thousands_sep_kind != kind) {
9406 if (thousands_sep_kind < kind)
9407 PyMem_Free(thousands_sep_data);
9408 else
9409 PyMem_Free(data);
9410 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009411 if (unicode == NULL) {
9412 *maxchar = 127;
9413 if (len != n_digits) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009414 *maxchar = Py_MAX(*maxchar,
Victor Stinnere6abb482012-05-02 01:15:40 +02009415 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
Victor Stinner41a863c2012-02-24 00:37:51 +01009416 }
9417 }
9418 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009419}
9420
9421
Alexander Belopolsky40018472011-02-26 01:02:56 +00009422Py_ssize_t
9423PyUnicode_Count(PyObject *str,
9424 PyObject *substr,
9425 Py_ssize_t start,
9426 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009427{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009428 Py_ssize_t result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009429 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009430 void *buf1 = NULL, *buf2 = NULL;
9431 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009432
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009433 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009434 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009435
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009436 kind1 = PyUnicode_KIND(str);
9437 kind2 = PyUnicode_KIND(substr);
9438 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009439 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009440
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009441 len1 = PyUnicode_GET_LENGTH(str);
9442 len2 = PyUnicode_GET_LENGTH(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009443 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009444 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009445 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009446
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009447 buf1 = PyUnicode_DATA(str);
9448 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009449 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009450 buf2 = _PyUnicode_AsKind(substr, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009451 if (!buf2)
9452 goto onError;
9453 }
9454
9455 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009456 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009457 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
Victor Stinnerc3cec782011-10-05 21:24:08 +02009458 result = asciilib_count(
9459 ((Py_UCS1*)buf1) + start, end - start,
9460 buf2, len2, PY_SSIZE_T_MAX
9461 );
9462 else
9463 result = ucs1lib_count(
9464 ((Py_UCS1*)buf1) + start, end - start,
9465 buf2, len2, PY_SSIZE_T_MAX
9466 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009467 break;
9468 case PyUnicode_2BYTE_KIND:
9469 result = ucs2lib_count(
9470 ((Py_UCS2*)buf1) + start, end - start,
9471 buf2, len2, PY_SSIZE_T_MAX
9472 );
9473 break;
9474 case PyUnicode_4BYTE_KIND:
9475 result = ucs4lib_count(
9476 ((Py_UCS4*)buf1) + start, end - start,
9477 buf2, len2, PY_SSIZE_T_MAX
9478 );
9479 break;
9480 default:
9481 assert(0); result = 0;
9482 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009483
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009484 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009485 PyMem_Free(buf2);
9486
Guido van Rossumd57fd912000-03-10 22:53:23 +00009487 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009488 onError:
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009489 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009490 PyMem_Free(buf2);
9491 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009492}
9493
Alexander Belopolsky40018472011-02-26 01:02:56 +00009494Py_ssize_t
9495PyUnicode_Find(PyObject *str,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009496 PyObject *substr,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009497 Py_ssize_t start,
9498 Py_ssize_t end,
9499 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009500{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009501 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009502 return -2;
Tim Petersced69f82003-09-16 20:30:58 +00009503
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009504 return any_find_slice(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009505}
9506
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009507Py_ssize_t
9508PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9509 Py_ssize_t start, Py_ssize_t end,
9510 int direction)
9511{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009512 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009513 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009514 if (PyUnicode_READY(str) == -1)
9515 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009516 if (start < 0 || end < 0) {
9517 PyErr_SetString(PyExc_IndexError, "string index out of range");
9518 return -2;
9519 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009520 if (end > PyUnicode_GET_LENGTH(str))
9521 end = PyUnicode_GET_LENGTH(str);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009522 if (start >= end)
9523 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009524 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009525 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9526 kind, end-start, ch, direction);
9527 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009528 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009529 else
9530 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009531}
9532
Alexander Belopolsky40018472011-02-26 01:02:56 +00009533static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009534tailmatch(PyObject *self,
9535 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009536 Py_ssize_t start,
9537 Py_ssize_t end,
9538 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009539{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009540 int kind_self;
9541 int kind_sub;
9542 void *data_self;
9543 void *data_sub;
9544 Py_ssize_t offset;
9545 Py_ssize_t i;
9546 Py_ssize_t end_sub;
9547
9548 if (PyUnicode_READY(self) == -1 ||
9549 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009550 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009551
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009552 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9553 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009554 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009555 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009556
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009557 if (PyUnicode_GET_LENGTH(substring) == 0)
9558 return 1;
9559
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009560 kind_self = PyUnicode_KIND(self);
9561 data_self = PyUnicode_DATA(self);
9562 kind_sub = PyUnicode_KIND(substring);
9563 data_sub = PyUnicode_DATA(substring);
9564 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9565
9566 if (direction > 0)
9567 offset = end;
9568 else
9569 offset = start;
9570
9571 if (PyUnicode_READ(kind_self, data_self, offset) ==
9572 PyUnicode_READ(kind_sub, data_sub, 0) &&
9573 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9574 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9575 /* If both are of the same kind, memcmp is sufficient */
9576 if (kind_self == kind_sub) {
9577 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009578 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009579 data_sub,
9580 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009581 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009582 }
Martin Pantere26da7c2016-06-02 10:07:09 +00009583 /* otherwise we have to compare each character by first accessing it */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009584 else {
9585 /* We do not need to compare 0 and len(substring)-1 because
9586 the if statement above ensured already that they are equal
9587 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009588 for (i = 1; i < end_sub; ++i) {
9589 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9590 PyUnicode_READ(kind_sub, data_sub, i))
9591 return 0;
9592 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009593 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009594 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009595 }
9596
9597 return 0;
9598}
9599
Alexander Belopolsky40018472011-02-26 01:02:56 +00009600Py_ssize_t
9601PyUnicode_Tailmatch(PyObject *str,
9602 PyObject *substr,
9603 Py_ssize_t start,
9604 Py_ssize_t end,
9605 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009606{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009607 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009608 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009609
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009610 return tailmatch(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009611}
9612
Guido van Rossumd57fd912000-03-10 22:53:23 +00009613/* Apply fixfct filter to the Unicode object self and return a
9614 reference to the modified object */
9615
Alexander Belopolsky40018472011-02-26 01:02:56 +00009616static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009617fixup(PyObject *self,
9618 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009619{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009620 PyObject *u;
9621 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009622 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009623
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009624 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009625 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009626 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009627 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009628
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009629 /* fix functions return the new maximum character in a string,
9630 if the kind of the resulting unicode object does not change,
9631 everything is fine. Otherwise we need to change the string kind
9632 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009633 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009634
9635 if (maxchar_new == 0) {
9636 /* no changes */;
9637 if (PyUnicode_CheckExact(self)) {
9638 Py_DECREF(u);
9639 Py_INCREF(self);
9640 return self;
9641 }
9642 else
9643 return u;
9644 }
9645
Victor Stinnere6abb482012-05-02 01:15:40 +02009646 maxchar_new = align_maxchar(maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009647
Victor Stinnereaab6042011-12-11 22:22:39 +01009648 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009649 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009650
9651 /* In case the maximum character changed, we need to
9652 convert the string to the new category. */
9653 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9654 if (v == NULL) {
9655 Py_DECREF(u);
9656 return NULL;
9657 }
9658 if (maxchar_new > maxchar_old) {
9659 /* If the maxchar increased so that the kind changed, not all
9660 characters are representable anymore and we need to fix the
9661 string again. This only happens in very few cases. */
Victor Stinnerd3f08822012-05-29 12:57:52 +02009662 _PyUnicode_FastCopyCharacters(v, 0,
9663 self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinnereaab6042011-12-11 22:22:39 +01009664 maxchar_old = fixfct(v);
9665 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009666 }
9667 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009668 _PyUnicode_FastCopyCharacters(v, 0,
9669 u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009670 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009671 Py_DECREF(u);
9672 assert(_PyUnicode_CheckConsistency(v, 1));
9673 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009674}
9675
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009676static PyObject *
9677ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009678{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009679 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9680 char *resdata, *data = PyUnicode_DATA(self);
9681 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009682
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009683 res = PyUnicode_New(len, 127);
9684 if (res == NULL)
9685 return NULL;
9686 resdata = PyUnicode_DATA(res);
9687 if (lower)
9688 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009689 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009690 _Py_bytes_upper(resdata, data, len);
9691 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009692}
9693
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009694static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009695handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009696{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009697 Py_ssize_t j;
9698 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01009699 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009700 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009701
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009702 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9703
9704 where ! is a negation and \p{xxx} is a character with property xxx.
9705 */
9706 for (j = i - 1; j >= 0; j--) {
9707 c = PyUnicode_READ(kind, data, j);
9708 if (!_PyUnicode_IsCaseIgnorable(c))
9709 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009710 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009711 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9712 if (final_sigma) {
9713 for (j = i + 1; j < length; j++) {
9714 c = PyUnicode_READ(kind, data, j);
9715 if (!_PyUnicode_IsCaseIgnorable(c))
9716 break;
9717 }
9718 final_sigma = j == length || !_PyUnicode_IsCased(c);
9719 }
9720 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009721}
9722
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009723static int
9724lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9725 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009726{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009727 /* Obscure special case. */
9728 if (c == 0x3A3) {
9729 mapped[0] = handle_capital_sigma(kind, data, length, i);
9730 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009731 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009732 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009733}
9734
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009735static Py_ssize_t
9736do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009737{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009738 Py_ssize_t i, k = 0;
9739 int n_res, j;
9740 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009741
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009742 c = PyUnicode_READ(kind, data, 0);
9743 n_res = _PyUnicode_ToUpperFull(c, mapped);
9744 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009745 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009746 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009747 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009748 for (i = 1; i < length; i++) {
9749 c = PyUnicode_READ(kind, data, i);
9750 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9751 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009752 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009753 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009754 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009755 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009756 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009757}
9758
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009759static Py_ssize_t
9760do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9761 Py_ssize_t i, k = 0;
9762
9763 for (i = 0; i < length; i++) {
9764 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9765 int n_res, j;
9766 if (Py_UNICODE_ISUPPER(c)) {
9767 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9768 }
9769 else if (Py_UNICODE_ISLOWER(c)) {
9770 n_res = _PyUnicode_ToUpperFull(c, mapped);
9771 }
9772 else {
9773 n_res = 1;
9774 mapped[0] = c;
9775 }
9776 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009777 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009778 res[k++] = mapped[j];
9779 }
9780 }
9781 return k;
9782}
9783
9784static Py_ssize_t
9785do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9786 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009787{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009788 Py_ssize_t i, k = 0;
9789
9790 for (i = 0; i < length; i++) {
9791 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9792 int n_res, j;
9793 if (lower)
9794 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9795 else
9796 n_res = _PyUnicode_ToUpperFull(c, mapped);
9797 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009798 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009799 res[k++] = mapped[j];
9800 }
9801 }
9802 return k;
9803}
9804
9805static Py_ssize_t
9806do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9807{
9808 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9809}
9810
9811static Py_ssize_t
9812do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9813{
9814 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9815}
9816
Benjamin Petersone51757f2012-01-12 21:10:29 -05009817static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009818do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9819{
9820 Py_ssize_t i, k = 0;
9821
9822 for (i = 0; i < length; i++) {
9823 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9824 Py_UCS4 mapped[3];
9825 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9826 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009827 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009828 res[k++] = mapped[j];
9829 }
9830 }
9831 return k;
9832}
9833
9834static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009835do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9836{
9837 Py_ssize_t i, k = 0;
9838 int previous_is_cased;
9839
9840 previous_is_cased = 0;
9841 for (i = 0; i < length; i++) {
9842 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9843 Py_UCS4 mapped[3];
9844 int n_res, j;
9845
9846 if (previous_is_cased)
9847 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9848 else
9849 n_res = _PyUnicode_ToTitleFull(c, mapped);
9850
9851 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009852 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009853 res[k++] = mapped[j];
9854 }
9855
9856 previous_is_cased = _PyUnicode_IsCased(c);
9857 }
9858 return k;
9859}
9860
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009861static PyObject *
9862case_operation(PyObject *self,
9863 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9864{
9865 PyObject *res = NULL;
9866 Py_ssize_t length, newlength = 0;
9867 int kind, outkind;
9868 void *data, *outdata;
9869 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9870
Benjamin Petersoneea48462012-01-16 14:28:50 -05009871 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009872
9873 kind = PyUnicode_KIND(self);
9874 data = PyUnicode_DATA(self);
9875 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +02009876 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -04009877 PyErr_SetString(PyExc_OverflowError, "string is too long");
9878 return NULL;
9879 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -04009880 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009881 if (tmp == NULL)
9882 return PyErr_NoMemory();
9883 newlength = perform(kind, data, length, tmp, &maxchar);
9884 res = PyUnicode_New(newlength, maxchar);
9885 if (res == NULL)
9886 goto leave;
9887 tmpend = tmp + newlength;
9888 outdata = PyUnicode_DATA(res);
9889 outkind = PyUnicode_KIND(res);
9890 switch (outkind) {
9891 case PyUnicode_1BYTE_KIND:
9892 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9893 break;
9894 case PyUnicode_2BYTE_KIND:
9895 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9896 break;
9897 case PyUnicode_4BYTE_KIND:
9898 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9899 break;
9900 default:
9901 assert(0);
9902 break;
9903 }
9904 leave:
9905 PyMem_FREE(tmp);
9906 return res;
9907}
9908
Tim Peters8ce9f162004-08-27 01:49:32 +00009909PyObject *
9910PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009911{
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009912 PyObject *res;
9913 PyObject *fseq;
9914 Py_ssize_t seqlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009915 PyObject **items;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009916
Benjamin Peterson9743b2c2014-02-15 13:02:52 -05009917 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +00009918 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009919 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009920 }
9921
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009922 /* NOTE: the following code can't call back into Python code,
9923 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009924 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009925
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009926 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00009927 seqlen = PySequence_Fast_GET_SIZE(fseq);
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009928 res = _PyUnicode_JoinArray(separator, items, seqlen);
9929 Py_DECREF(fseq);
9930 return res;
9931}
9932
9933PyObject *
9934_PyUnicode_JoinArray(PyObject *separator, PyObject **items, Py_ssize_t seqlen)
9935{
9936 PyObject *res = NULL; /* the result */
9937 PyObject *sep = NULL;
9938 Py_ssize_t seplen;
9939 PyObject *item;
9940 Py_ssize_t sz, i, res_offset;
9941 Py_UCS4 maxchar;
9942 Py_UCS4 item_maxchar;
9943 int use_memcpy;
9944 unsigned char *res_data = NULL, *sep_data = NULL;
9945 PyObject *last_obj;
9946 unsigned int kind = 0;
9947
Tim Peters05eba1f2004-08-27 21:32:02 +00009948 /* If empty sequence, return u"". */
9949 if (seqlen == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02009950 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +00009951 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009952
Tim Peters05eba1f2004-08-27 21:32:02 +00009953 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009954 last_obj = NULL;
Victor Stinneracf47b82011-10-06 12:32:37 +02009955 if (seqlen == 1) {
9956 if (PyUnicode_CheckExact(items[0])) {
9957 res = items[0];
9958 Py_INCREF(res);
Victor Stinneracf47b82011-10-06 12:32:37 +02009959 return res;
9960 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009961 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009962 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009963 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009964 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009965 /* Set up sep and seplen */
9966 if (separator == NULL) {
9967 /* fall back to a blank space separator */
9968 sep = PyUnicode_FromOrdinal(' ');
9969 if (!sep)
9970 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009971 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009972 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009973 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009974 else {
9975 if (!PyUnicode_Check(separator)) {
9976 PyErr_Format(PyExc_TypeError,
9977 "separator: expected str instance,"
9978 " %.80s found",
9979 Py_TYPE(separator)->tp_name);
9980 goto onError;
9981 }
9982 if (PyUnicode_READY(separator))
9983 goto onError;
9984 sep = separator;
9985 seplen = PyUnicode_GET_LENGTH(separator);
9986 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9987 /* inc refcount to keep this code path symmetric with the
9988 above case of a blank separator */
9989 Py_INCREF(sep);
9990 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009991 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009992 }
9993
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009994 /* There are at least two things to join, or else we have a subclass
9995 * of str in the sequence.
9996 * Do a pre-pass to figure out the total amount of space we'll
9997 * need (sz), and see whether all argument are strings.
9998 */
9999 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +020010000#ifdef Py_DEBUG
10001 use_memcpy = 0;
10002#else
10003 use_memcpy = 1;
10004#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010005 for (i = 0; i < seqlen; i++) {
10006 const Py_ssize_t old_sz = sz;
10007 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +000010008 if (!PyUnicode_Check(item)) {
10009 PyErr_Format(PyExc_TypeError,
Victor Stinnera33bce02014-07-04 22:47:46 +020010010 "sequence item %zd: expected str instance,"
Benjamin Peterson29060642009-01-31 22:14:21 +000010011 " %.80s found",
10012 i, Py_TYPE(item)->tp_name);
10013 goto onError;
10014 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010015 if (PyUnicode_READY(item) == -1)
10016 goto onError;
10017 sz += PyUnicode_GET_LENGTH(item);
10018 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010019 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010020 if (i != 0)
10021 sz += seplen;
10022 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
10023 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010024 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010025 goto onError;
10026 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010027 if (use_memcpy && last_obj != NULL) {
10028 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10029 use_memcpy = 0;
10030 }
10031 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010032 }
Tim Petersced69f82003-09-16 20:30:58 +000010033
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010034 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010035 if (res == NULL)
10036 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +000010037
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010038 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010039#ifdef Py_DEBUG
10040 use_memcpy = 0;
10041#else
10042 if (use_memcpy) {
10043 res_data = PyUnicode_1BYTE_DATA(res);
10044 kind = PyUnicode_KIND(res);
10045 if (seplen != 0)
10046 sep_data = PyUnicode_1BYTE_DATA(sep);
10047 }
10048#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +020010049 if (use_memcpy) {
10050 for (i = 0; i < seqlen; ++i) {
10051 Py_ssize_t itemlen;
10052 item = items[i];
10053
10054 /* Copy item, and maybe the separator. */
10055 if (i && seplen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010056 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010057 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010058 kind * seplen);
10059 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010060 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010061
10062 itemlen = PyUnicode_GET_LENGTH(item);
10063 if (itemlen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010064 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010065 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010066 kind * itemlen);
10067 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010068 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010069 }
10070 assert(res_data == PyUnicode_1BYTE_DATA(res)
10071 + kind * PyUnicode_GET_LENGTH(res));
10072 }
10073 else {
10074 for (i = 0, res_offset = 0; i < seqlen; ++i) {
10075 Py_ssize_t itemlen;
10076 item = items[i];
10077
10078 /* Copy item, and maybe the separator. */
10079 if (i && seplen != 0) {
10080 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10081 res_offset += seplen;
10082 }
10083
10084 itemlen = PyUnicode_GET_LENGTH(item);
10085 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020010086 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +020010087 res_offset += itemlen;
10088 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010089 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010090 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +020010091 }
Tim Peters8ce9f162004-08-27 01:49:32 +000010092
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010093 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010094 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010095 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010096
Benjamin Peterson29060642009-01-31 22:14:21 +000010097 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010098 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010099 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010100 return NULL;
10101}
10102
/* Store `value` into `length` consecutive code units of the buffer `data`,
   beginning at index `start`.  `kind` selects the code unit width and must
   be one of the 1-, 2- or 4-byte kinds.

   This is a macro: `kind`, `value` and `length` may be evaluated more than
   once, so arguments must not have side effects.  Every argument is
   parenthesized at each use so that expression arguments bind as the
   caller intended. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
10126
Victor Stinnerd3f08822012-05-29 12:57:52 +020010127void
10128_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10129 Py_UCS4 fill_char)
10130{
10131 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
10132 const void *data = PyUnicode_DATA(unicode);
10133 assert(PyUnicode_IS_READY(unicode));
10134 assert(unicode_modifiable(unicode));
10135 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10136 assert(start >= 0);
10137 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
10138 FILL(kind, data, fill_char, start, length);
10139}
10140
Victor Stinner3fe55312012-01-04 00:33:50 +010010141Py_ssize_t
10142PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10143 Py_UCS4 fill_char)
10144{
10145 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +010010146
10147 if (!PyUnicode_Check(unicode)) {
10148 PyErr_BadInternalCall();
10149 return -1;
10150 }
10151 if (PyUnicode_READY(unicode) == -1)
10152 return -1;
10153 if (unicode_check_modifiable(unicode))
10154 return -1;
10155
Victor Stinnerd3f08822012-05-29 12:57:52 +020010156 if (start < 0) {
10157 PyErr_SetString(PyExc_IndexError, "string index out of range");
10158 return -1;
10159 }
Victor Stinner3fe55312012-01-04 00:33:50 +010010160 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10161 PyErr_SetString(PyExc_ValueError,
10162 "fill character is bigger than "
10163 "the string maximum character");
10164 return -1;
10165 }
10166
10167 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10168 length = Py_MIN(maxlen, length);
10169 if (length <= 0)
10170 return 0;
10171
Victor Stinnerd3f08822012-05-29 12:57:52 +020010172 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +010010173 return length;
10174}
10175
Victor Stinner9310abb2011-10-05 00:59:23 +020010176static PyObject *
10177pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010178 Py_ssize_t left,
10179 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010180 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010181{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010182 PyObject *u;
10183 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010184 int kind;
10185 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010186
10187 if (left < 0)
10188 left = 0;
10189 if (right < 0)
10190 right = 0;
10191
Victor Stinnerc4b49542011-12-11 22:44:26 +010010192 if (left == 0 && right == 0)
10193 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010194
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010195 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10196 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010197 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10198 return NULL;
10199 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010200 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010201 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010202 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010203 if (!u)
10204 return NULL;
10205
10206 kind = PyUnicode_KIND(u);
10207 data = PyUnicode_DATA(u);
10208 if (left)
10209 FILL(kind, data, fill, 0, left);
10210 if (right)
10211 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010212 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010213 assert(_PyUnicode_CheckConsistency(u, 1));
10214 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010215}
10216
Alexander Belopolsky40018472011-02-26 01:02:56 +000010217PyObject *
10218PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010219{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010220 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010221
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010222 if (ensure_unicode(string) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010223 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010224
Benjamin Petersonead6b532011-12-20 17:23:42 -060010225 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010226 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010227 if (PyUnicode_IS_ASCII(string))
10228 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010229 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010230 PyUnicode_GET_LENGTH(string), keepends);
10231 else
10232 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010233 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010234 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010235 break;
10236 case PyUnicode_2BYTE_KIND:
10237 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010238 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010239 PyUnicode_GET_LENGTH(string), keepends);
10240 break;
10241 case PyUnicode_4BYTE_KIND:
10242 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010243 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010244 PyUnicode_GET_LENGTH(string), keepends);
10245 break;
10246 default:
10247 assert(0);
10248 list = 0;
10249 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010250 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010251}
10252
Alexander Belopolsky40018472011-02-26 01:02:56 +000010253static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010254split(PyObject *self,
10255 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010256 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010257{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010258 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010259 void *buf1, *buf2;
10260 Py_ssize_t len1, len2;
10261 PyObject* out;
10262
Guido van Rossumd57fd912000-03-10 22:53:23 +000010263 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010264 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010265
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010266 if (PyUnicode_READY(self) == -1)
10267 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010268
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010269 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010270 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010271 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010272 if (PyUnicode_IS_ASCII(self))
10273 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010274 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010275 PyUnicode_GET_LENGTH(self), maxcount
10276 );
10277 else
10278 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010279 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010280 PyUnicode_GET_LENGTH(self), maxcount
10281 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010282 case PyUnicode_2BYTE_KIND:
10283 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010284 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010285 PyUnicode_GET_LENGTH(self), maxcount
10286 );
10287 case PyUnicode_4BYTE_KIND:
10288 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010289 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010290 PyUnicode_GET_LENGTH(self), maxcount
10291 );
10292 default:
10293 assert(0);
10294 return NULL;
10295 }
10296
10297 if (PyUnicode_READY(substring) == -1)
10298 return NULL;
10299
10300 kind1 = PyUnicode_KIND(self);
10301 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010302 len1 = PyUnicode_GET_LENGTH(self);
10303 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010304 if (kind1 < kind2 || len1 < len2) {
10305 out = PyList_New(1);
10306 if (out == NULL)
10307 return NULL;
10308 Py_INCREF(self);
10309 PyList_SET_ITEM(out, 0, self);
10310 return out;
10311 }
10312 buf1 = PyUnicode_DATA(self);
10313 buf2 = PyUnicode_DATA(substring);
10314 if (kind2 != kind1) {
10315 buf2 = _PyUnicode_AsKind(substring, kind1);
10316 if (!buf2)
10317 return NULL;
10318 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010319
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010320 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010321 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010322 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10323 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010324 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010325 else
10326 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010327 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010328 break;
10329 case PyUnicode_2BYTE_KIND:
10330 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010331 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010332 break;
10333 case PyUnicode_4BYTE_KIND:
10334 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010335 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010336 break;
10337 default:
10338 out = NULL;
10339 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010340 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010341 PyMem_Free(buf2);
10342 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010343}
10344
Alexander Belopolsky40018472011-02-26 01:02:56 +000010345static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010346rsplit(PyObject *self,
10347 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010348 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010349{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010350 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010351 void *buf1, *buf2;
10352 Py_ssize_t len1, len2;
10353 PyObject* out;
10354
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010355 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010356 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010357
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010358 if (PyUnicode_READY(self) == -1)
10359 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010360
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010361 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010362 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010363 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010364 if (PyUnicode_IS_ASCII(self))
10365 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010366 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010367 PyUnicode_GET_LENGTH(self), maxcount
10368 );
10369 else
10370 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010371 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010372 PyUnicode_GET_LENGTH(self), maxcount
10373 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010374 case PyUnicode_2BYTE_KIND:
10375 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010376 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010377 PyUnicode_GET_LENGTH(self), maxcount
10378 );
10379 case PyUnicode_4BYTE_KIND:
10380 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010381 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010382 PyUnicode_GET_LENGTH(self), maxcount
10383 );
10384 default:
10385 assert(0);
10386 return NULL;
10387 }
10388
10389 if (PyUnicode_READY(substring) == -1)
10390 return NULL;
10391
10392 kind1 = PyUnicode_KIND(self);
10393 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010394 len1 = PyUnicode_GET_LENGTH(self);
10395 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010396 if (kind1 < kind2 || len1 < len2) {
10397 out = PyList_New(1);
10398 if (out == NULL)
10399 return NULL;
10400 Py_INCREF(self);
10401 PyList_SET_ITEM(out, 0, self);
10402 return out;
10403 }
10404 buf1 = PyUnicode_DATA(self);
10405 buf2 = PyUnicode_DATA(substring);
10406 if (kind2 != kind1) {
10407 buf2 = _PyUnicode_AsKind(substring, kind1);
10408 if (!buf2)
10409 return NULL;
10410 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010411
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010412 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010413 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010414 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10415 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010416 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010417 else
10418 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010419 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010420 break;
10421 case PyUnicode_2BYTE_KIND:
10422 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010423 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010424 break;
10425 case PyUnicode_4BYTE_KIND:
10426 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010427 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010428 break;
10429 default:
10430 out = NULL;
10431 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010432 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010433 PyMem_Free(buf2);
10434 return out;
10435}
10436
10437static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010438anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10439 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010440{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010441 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010442 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010443 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10444 return asciilib_find(buf1, len1, buf2, len2, offset);
10445 else
10446 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010447 case PyUnicode_2BYTE_KIND:
10448 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10449 case PyUnicode_4BYTE_KIND:
10450 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10451 }
10452 assert(0);
10453 return -1;
10454}
10455
10456static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010457anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10458 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010459{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010460 switch (kind) {
10461 case PyUnicode_1BYTE_KIND:
10462 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10463 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10464 else
10465 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10466 case PyUnicode_2BYTE_KIND:
10467 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10468 case PyUnicode_4BYTE_KIND:
10469 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10470 }
10471 assert(0);
10472 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010473}
10474
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010475static void
10476replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10477 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10478{
10479 int kind = PyUnicode_KIND(u);
10480 void *data = PyUnicode_DATA(u);
10481 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10482 if (kind == PyUnicode_1BYTE_KIND) {
10483 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10484 (Py_UCS1 *)data + len,
10485 u1, u2, maxcount);
10486 }
10487 else if (kind == PyUnicode_2BYTE_KIND) {
10488 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10489 (Py_UCS2 *)data + len,
10490 u1, u2, maxcount);
10491 }
10492 else {
10493 assert(kind == PyUnicode_4BYTE_KIND);
10494 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10495 (Py_UCS4 *)data + len,
10496 u1, u2, maxcount);
10497 }
10498}
10499
Alexander Belopolsky40018472011-02-26 01:02:56 +000010500static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010501replace(PyObject *self, PyObject *str1,
10502 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010503{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010504 PyObject *u;
10505 char *sbuf = PyUnicode_DATA(self);
10506 char *buf1 = PyUnicode_DATA(str1);
10507 char *buf2 = PyUnicode_DATA(str2);
10508 int srelease = 0, release1 = 0, release2 = 0;
10509 int skind = PyUnicode_KIND(self);
10510 int kind1 = PyUnicode_KIND(str1);
10511 int kind2 = PyUnicode_KIND(str2);
10512 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10513 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10514 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010515 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010516 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010517
10518 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010519 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010520 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010521 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010522
Victor Stinner59de0ee2011-10-07 10:01:28 +020010523 if (str1 == str2)
10524 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010525
Victor Stinner49a0a212011-10-12 23:46:10 +020010526 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010527 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10528 if (maxchar < maxchar_str1)
10529 /* substring too wide to be present */
10530 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010531 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10532 /* Replacing str1 with str2 may cause a maxchar reduction in the
10533 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010534 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010535 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010536
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010537 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010538 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010539 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010540 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010541 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010542 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010543 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010544 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010545
Victor Stinner69ed0f42013-04-09 21:48:24 +020010546 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010547 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010548 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010549 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010550 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010551 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010552 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010553 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010554
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010555 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10556 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010557 }
10558 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010559 int rkind = skind;
10560 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010561 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010562
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010563 if (kind1 < rkind) {
10564 /* widen substring */
10565 buf1 = _PyUnicode_AsKind(str1, rkind);
10566 if (!buf1) goto error;
10567 release1 = 1;
10568 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010569 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010570 if (i < 0)
10571 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010572 if (rkind > kind2) {
10573 /* widen replacement */
10574 buf2 = _PyUnicode_AsKind(str2, rkind);
10575 if (!buf2) goto error;
10576 release2 = 1;
10577 }
10578 else if (rkind < kind2) {
10579 /* widen self and buf1 */
10580 rkind = kind2;
10581 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010582 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010583 sbuf = _PyUnicode_AsKind(self, rkind);
10584 if (!sbuf) goto error;
10585 srelease = 1;
10586 buf1 = _PyUnicode_AsKind(str1, rkind);
10587 if (!buf1) goto error;
10588 release1 = 1;
10589 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010590 u = PyUnicode_New(slen, maxchar);
10591 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010592 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010593 assert(PyUnicode_KIND(u) == rkind);
10594 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010595
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010596 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010597 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010598 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010599 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010600 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010601 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010602
10603 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010604 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010605 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010606 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010607 if (i == -1)
10608 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010609 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010610 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010611 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010612 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010613 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010614 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010615 }
10616 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010617 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010618 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010619 int rkind = skind;
10620 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010621
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010622 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010623 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010624 buf1 = _PyUnicode_AsKind(str1, rkind);
10625 if (!buf1) goto error;
10626 release1 = 1;
10627 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010628 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010629 if (n == 0)
10630 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010631 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010632 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010633 buf2 = _PyUnicode_AsKind(str2, rkind);
10634 if (!buf2) goto error;
10635 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010636 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010637 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010638 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010639 rkind = kind2;
10640 sbuf = _PyUnicode_AsKind(self, rkind);
10641 if (!sbuf) goto error;
10642 srelease = 1;
10643 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010644 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010645 buf1 = _PyUnicode_AsKind(str1, rkind);
10646 if (!buf1) goto error;
10647 release1 = 1;
10648 }
10649 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10650 PyUnicode_GET_LENGTH(str1))); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010651 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010652 PyErr_SetString(PyExc_OverflowError,
10653 "replace string is too long");
10654 goto error;
10655 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010656 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010657 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010658 _Py_INCREF_UNICODE_EMPTY();
10659 if (!unicode_empty)
10660 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010661 u = unicode_empty;
10662 goto done;
10663 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010664 if (new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010665 PyErr_SetString(PyExc_OverflowError,
10666 "replace string is too long");
10667 goto error;
10668 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010669 u = PyUnicode_New(new_size, maxchar);
10670 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010671 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010672 assert(PyUnicode_KIND(u) == rkind);
10673 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010674 ires = i = 0;
10675 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010676 while (n-- > 0) {
10677 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010678 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010679 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010680 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010681 if (j == -1)
10682 break;
10683 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010684 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010685 memcpy(res + rkind * ires,
10686 sbuf + rkind * i,
10687 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010688 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010689 }
10690 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010691 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010692 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010693 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010694 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010695 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010696 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010697 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010698 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010699 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010700 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010701 memcpy(res + rkind * ires,
10702 sbuf + rkind * i,
10703 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010704 }
10705 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010706 /* interleave */
10707 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010708 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010709 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010710 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010711 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010712 if (--n <= 0)
10713 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010714 memcpy(res + rkind * ires,
10715 sbuf + rkind * i,
10716 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010717 ires++;
10718 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010719 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010720 memcpy(res + rkind * ires,
10721 sbuf + rkind * i,
10722 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010723 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010724 }
10725
10726 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010727 unicode_adjust_maxchar(&u);
10728 if (u == NULL)
10729 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010730 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010731
10732 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010733 if (srelease)
10734 PyMem_FREE(sbuf);
10735 if (release1)
10736 PyMem_FREE(buf1);
10737 if (release2)
10738 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010739 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010740 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010741
Benjamin Peterson29060642009-01-31 22:14:21 +000010742 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010743 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010744 if (srelease)
10745 PyMem_FREE(sbuf);
10746 if (release1)
10747 PyMem_FREE(buf1);
10748 if (release2)
10749 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010750 return unicode_result_unchanged(self);
10751
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010752 error:
10753 if (srelease && sbuf)
10754 PyMem_FREE(sbuf);
10755 if (release1 && buf1)
10756 PyMem_FREE(buf1);
10757 if (release2 && buf2)
10758 PyMem_FREE(buf2);
10759 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010760}
10761
10762/* --- Unicode Object Methods --------------------------------------------- */
10763
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010764PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010765 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010766\n\
10767Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010768characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010769
10770static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010771unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010772{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010773 if (PyUnicode_READY(self) == -1)
10774 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010775 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010776}
10777
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010778PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010779 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010780\n\
10781Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010782have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010783
10784static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010785unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010786{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010787 if (PyUnicode_READY(self) == -1)
10788 return NULL;
10789 if (PyUnicode_GET_LENGTH(self) == 0)
10790 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010791 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010792}
10793
Benjamin Petersond5890c82012-01-14 13:23:30 -050010794PyDoc_STRVAR(casefold__doc__,
10795 "S.casefold() -> str\n\
10796\n\
10797Return a version of S suitable for caseless comparisons.");
10798
10799static PyObject *
10800unicode_casefold(PyObject *self)
10801{
10802 if (PyUnicode_READY(self) == -1)
10803 return NULL;
10804 if (PyUnicode_IS_ASCII(self))
10805 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010806 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010807}
10808
10809
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010810/* Argument converter. Accepts a single Unicode character. */
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010811
10812static int
10813convert_uc(PyObject *obj, void *addr)
10814{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010815 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010816
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010817 if (!PyUnicode_Check(obj)) {
10818 PyErr_Format(PyExc_TypeError,
10819 "The fill character must be a unicode character, "
10820 "not %.100s", Py_TYPE(obj)->tp_name);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010821 return 0;
10822 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010823 if (PyUnicode_READY(obj) < 0)
10824 return 0;
10825 if (PyUnicode_GET_LENGTH(obj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010826 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010827 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010828 return 0;
10829 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010830 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010831 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010832}
10833
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010834PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010835 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010836\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010837Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010838done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010839
10840static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010841unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010842{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010843 Py_ssize_t marg, left;
10844 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010845 Py_UCS4 fillchar = ' ';
10846
Victor Stinnere9a29352011-10-01 02:14:59 +020010847 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010848 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010849
Benjamin Petersonbac79492012-01-14 13:34:47 -050010850 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010851 return NULL;
10852
Victor Stinnerc4b49542011-12-11 22:44:26 +010010853 if (PyUnicode_GET_LENGTH(self) >= width)
10854 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010855
Victor Stinnerc4b49542011-12-11 22:44:26 +010010856 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010857 left = marg / 2 + (marg & width & 1);
10858
Victor Stinner9310abb2011-10-05 00:59:23 +020010859 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010860}
10861
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010862/* This function assumes that str1 and str2 are readied by the caller. */
10863
Marc-André Lemburge5034372000-08-08 08:04:29 +000010864static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010865unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010866{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010867#define COMPARE(TYPE1, TYPE2) \
10868 do { \
10869 TYPE1* p1 = (TYPE1 *)data1; \
10870 TYPE2* p2 = (TYPE2 *)data2; \
10871 TYPE1* end = p1 + len; \
10872 Py_UCS4 c1, c2; \
10873 for (; p1 != end; p1++, p2++) { \
10874 c1 = *p1; \
10875 c2 = *p2; \
10876 if (c1 != c2) \
10877 return (c1 < c2) ? -1 : 1; \
10878 } \
10879 } \
10880 while (0)
10881
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010882 int kind1, kind2;
10883 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010884 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010885
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010886 kind1 = PyUnicode_KIND(str1);
10887 kind2 = PyUnicode_KIND(str2);
10888 data1 = PyUnicode_DATA(str1);
10889 data2 = PyUnicode_DATA(str2);
10890 len1 = PyUnicode_GET_LENGTH(str1);
10891 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010892 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010893
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010894 switch(kind1) {
10895 case PyUnicode_1BYTE_KIND:
10896 {
10897 switch(kind2) {
10898 case PyUnicode_1BYTE_KIND:
10899 {
10900 int cmp = memcmp(data1, data2, len);
10901 /* normalize result of memcmp() into the range [-1; 1] */
10902 if (cmp < 0)
10903 return -1;
10904 if (cmp > 0)
10905 return 1;
10906 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020010907 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010908 case PyUnicode_2BYTE_KIND:
10909 COMPARE(Py_UCS1, Py_UCS2);
10910 break;
10911 case PyUnicode_4BYTE_KIND:
10912 COMPARE(Py_UCS1, Py_UCS4);
10913 break;
10914 default:
10915 assert(0);
10916 }
10917 break;
10918 }
10919 case PyUnicode_2BYTE_KIND:
10920 {
10921 switch(kind2) {
10922 case PyUnicode_1BYTE_KIND:
10923 COMPARE(Py_UCS2, Py_UCS1);
10924 break;
10925 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010926 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010927 COMPARE(Py_UCS2, Py_UCS2);
10928 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010929 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010930 case PyUnicode_4BYTE_KIND:
10931 COMPARE(Py_UCS2, Py_UCS4);
10932 break;
10933 default:
10934 assert(0);
10935 }
10936 break;
10937 }
10938 case PyUnicode_4BYTE_KIND:
10939 {
10940 switch(kind2) {
10941 case PyUnicode_1BYTE_KIND:
10942 COMPARE(Py_UCS4, Py_UCS1);
10943 break;
10944 case PyUnicode_2BYTE_KIND:
10945 COMPARE(Py_UCS4, Py_UCS2);
10946 break;
10947 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010948 {
10949#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10950 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10951 /* normalize result of wmemcmp() into the range [-1; 1] */
10952 if (cmp < 0)
10953 return -1;
10954 if (cmp > 0)
10955 return 1;
10956#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010957 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020010958#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010959 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010960 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010961 default:
10962 assert(0);
10963 }
10964 break;
10965 }
10966 default:
10967 assert(0);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010968 }
10969
Victor Stinner770e19e2012-10-04 22:59:45 +020010970 if (len1 == len2)
10971 return 0;
10972 if (len1 < len2)
10973 return -1;
10974 else
10975 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010976
10977#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000010978}
10979
Benjamin Peterson621b4302016-09-09 13:54:34 -070010980static int
Victor Stinnere5567ad2012-10-23 02:48:49 +020010981unicode_compare_eq(PyObject *str1, PyObject *str2)
10982{
10983 int kind;
10984 void *data1, *data2;
10985 Py_ssize_t len;
10986 int cmp;
10987
Victor Stinnere5567ad2012-10-23 02:48:49 +020010988 len = PyUnicode_GET_LENGTH(str1);
10989 if (PyUnicode_GET_LENGTH(str2) != len)
10990 return 0;
10991 kind = PyUnicode_KIND(str1);
10992 if (PyUnicode_KIND(str2) != kind)
10993 return 0;
10994 data1 = PyUnicode_DATA(str1);
10995 data2 = PyUnicode_DATA(str2);
10996
10997 cmp = memcmp(data1, data2, len * kind);
10998 return (cmp == 0);
10999}
11000
11001
Alexander Belopolsky40018472011-02-26 01:02:56 +000011002int
11003PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011004{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011005 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
11006 if (PyUnicode_READY(left) == -1 ||
11007 PyUnicode_READY(right) == -1)
11008 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010011009
11010 /* a string is equal to itself */
11011 if (left == right)
11012 return 0;
11013
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011014 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011015 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000011016 PyErr_Format(PyExc_TypeError,
11017 "Can't compare %.100s and %.100s",
11018 left->ob_type->tp_name,
11019 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011020 return -1;
11021}
11022
Martin v. Löwis5b222132007-06-10 09:51:05 +000011023int
Victor Stinnerad14ccd2013-11-07 00:46:04 +010011024_PyUnicode_CompareWithId(PyObject *left, _Py_Identifier *right)
11025{
11026 PyObject *right_str = _PyUnicode_FromId(right); /* borrowed */
11027 if (right_str == NULL)
11028 return -1;
11029 return PyUnicode_Compare(left, right_str);
11030}
11031
11032int
Martin v. Löwis5b222132007-06-10 09:51:05 +000011033PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11034{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011035 Py_ssize_t i;
11036 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011037 Py_UCS4 chr;
11038
Victor Stinner910337b2011-10-03 03:20:16 +020011039 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011040 if (PyUnicode_READY(uni) == -1)
11041 return -1;
11042 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011043 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010011044 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010011045 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011046 size_t len, len2 = strlen(str);
11047 int cmp;
11048
11049 len = Py_MIN(len1, len2);
11050 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010011051 if (cmp != 0) {
11052 if (cmp < 0)
11053 return -1;
11054 else
11055 return 1;
11056 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010011057 if (len1 > len2)
11058 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011059 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010011060 return -1; /* str is longer */
11061 return 0;
11062 }
11063 else {
11064 void *data = PyUnicode_DATA(uni);
11065 /* Compare Unicode string and source character set string */
11066 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020011067 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010011068 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11069 /* This check keeps Python strings that end in '\0' from comparing equal
11070 to C strings identical up to that point. */
11071 if (PyUnicode_GET_LENGTH(uni) != i || chr)
11072 return 1; /* uni is longer */
11073 if (str[i])
11074 return -1; /* str is longer */
11075 return 0;
11076 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000011077}
11078
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011079
Benjamin Peterson29060642009-01-31 22:14:21 +000011080#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000011081 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011082
Alexander Belopolsky40018472011-02-26 01:02:56 +000011083PyObject *
11084PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011085{
11086 int result;
Victor Stinnere5567ad2012-10-23 02:48:49 +020011087 PyObject *v;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011088
Victor Stinnere5567ad2012-10-23 02:48:49 +020011089 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11090 Py_RETURN_NOTIMPLEMENTED;
11091
11092 if (PyUnicode_READY(left) == -1 ||
11093 PyUnicode_READY(right) == -1)
11094 return NULL;
11095
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011096 if (left == right) {
11097 switch (op) {
11098 case Py_EQ:
11099 case Py_LE:
11100 case Py_GE:
11101 /* a string is equal to itself */
11102 v = Py_True;
11103 break;
11104 case Py_NE:
11105 case Py_LT:
11106 case Py_GT:
11107 v = Py_False;
11108 break;
11109 default:
11110 PyErr_BadArgument();
11111 return NULL;
11112 }
11113 }
11114 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020011115 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010011116 result ^= (op == Py_NE);
11117 v = TEST_COND(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020011118 }
11119 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020011120 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011121
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011122 /* Convert the return value to a Boolean */
11123 switch (op) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011124 case Py_LE:
11125 v = TEST_COND(result <= 0);
11126 break;
11127 case Py_GE:
11128 v = TEST_COND(result >= 0);
11129 break;
11130 case Py_LT:
11131 v = TEST_COND(result == -1);
11132 break;
11133 case Py_GT:
11134 v = TEST_COND(result == 1);
11135 break;
11136 default:
11137 PyErr_BadArgument();
11138 return NULL;
11139 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011140 }
Victor Stinnere5567ad2012-10-23 02:48:49 +020011141 Py_INCREF(v);
11142 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011143}
11144
Alexander Belopolsky40018472011-02-26 01:02:56 +000011145int
Raymond Hettingerac2ef652015-07-04 16:04:44 -070011146_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11147{
11148 return unicode_eq(aa, bb);
11149}
11150
11151int
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011152PyUnicode_Contains(PyObject *str, PyObject *substr)
Guido van Rossum403d68b2000-03-13 15:55:09 +000011153{
Victor Stinner77282cb2013-04-14 19:22:47 +020011154 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011155 void *buf1, *buf2;
11156 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011157 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011158
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011159 if (!PyUnicode_Check(substr)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011160 PyErr_Format(PyExc_TypeError,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011161 "'in <string>' requires string as left operand, not %.100s",
11162 Py_TYPE(substr)->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011163 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011164 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011165 if (PyUnicode_READY(substr) == -1)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011166 return -1;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011167 if (ensure_unicode(str) < 0)
11168 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011169
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011170 kind1 = PyUnicode_KIND(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011171 kind2 = PyUnicode_KIND(substr);
11172 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011173 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011174 len1 = PyUnicode_GET_LENGTH(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011175 len2 = PyUnicode_GET_LENGTH(substr);
11176 if (len1 < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011177 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011178 buf1 = PyUnicode_DATA(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011179 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011180 if (len2 == 1) {
11181 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11182 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011183 return result;
11184 }
11185 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011186 buf2 = _PyUnicode_AsKind(substr, kind1);
11187 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011188 return -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011189 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011190
Victor Stinner77282cb2013-04-14 19:22:47 +020011191 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011192 case PyUnicode_1BYTE_KIND:
11193 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11194 break;
11195 case PyUnicode_2BYTE_KIND:
11196 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11197 break;
11198 case PyUnicode_4BYTE_KIND:
11199 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11200 break;
11201 default:
11202 result = -1;
11203 assert(0);
11204 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011205
Victor Stinner77282cb2013-04-14 19:22:47 +020011206 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011207 PyMem_Free(buf2);
11208
Guido van Rossum403d68b2000-03-13 15:55:09 +000011209 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011210}
11211
Guido van Rossumd57fd912000-03-10 22:53:23 +000011212/* Concat to string or Unicode object giving a new Unicode object. */
11213
Alexander Belopolsky40018472011-02-26 01:02:56 +000011214PyObject *
11215PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011216{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011217 PyObject *result;
Victor Stinner127226b2011-10-13 01:12:34 +020011218 Py_UCS4 maxchar, maxchar2;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011219 Py_ssize_t left_len, right_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011220
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011221 if (ensure_unicode(left) < 0 || ensure_unicode(right) < 0)
11222 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011223
11224 /* Shortcuts */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011225 if (left == unicode_empty)
11226 return PyUnicode_FromObject(right);
11227 if (right == unicode_empty)
11228 return PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011229
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011230 left_len = PyUnicode_GET_LENGTH(left);
11231 right_len = PyUnicode_GET_LENGTH(right);
11232 if (left_len > PY_SSIZE_T_MAX - right_len) {
Victor Stinner488fa492011-12-12 00:01:39 +010011233 PyErr_SetString(PyExc_OverflowError,
11234 "strings are too large to concat");
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011235 return NULL;
Victor Stinner488fa492011-12-12 00:01:39 +010011236 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011237 new_len = left_len + right_len;
Victor Stinner488fa492011-12-12 00:01:39 +010011238
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011239 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11240 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011241 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011242
Guido van Rossumd57fd912000-03-10 22:53:23 +000011243 /* Concat the two Unicode strings */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011244 result = PyUnicode_New(new_len, maxchar);
11245 if (result == NULL)
11246 return NULL;
11247 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11248 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11249 assert(_PyUnicode_CheckConsistency(result, 1));
11250 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011251}
11252
Walter Dörwald1ab83302007-05-18 17:15:44 +000011253void
Victor Stinner23e56682011-10-03 03:54:37 +020011254PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011255{
Victor Stinner23e56682011-10-03 03:54:37 +020011256 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011257 Py_UCS4 maxchar, maxchar2;
11258 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011259
11260 if (p_left == NULL) {
11261 if (!PyErr_Occurred())
11262 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011263 return;
11264 }
Victor Stinner23e56682011-10-03 03:54:37 +020011265 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011266 if (right == NULL || left == NULL
11267 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011268 if (!PyErr_Occurred())
11269 PyErr_BadInternalCall();
11270 goto error;
11271 }
11272
Benjamin Petersonbac79492012-01-14 13:34:47 -050011273 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011274 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011275 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011276 goto error;
11277
Victor Stinner488fa492011-12-12 00:01:39 +010011278 /* Shortcuts */
11279 if (left == unicode_empty) {
11280 Py_DECREF(left);
11281 Py_INCREF(right);
11282 *p_left = right;
11283 return;
11284 }
11285 if (right == unicode_empty)
11286 return;
11287
11288 left_len = PyUnicode_GET_LENGTH(left);
11289 right_len = PyUnicode_GET_LENGTH(right);
11290 if (left_len > PY_SSIZE_T_MAX - right_len) {
11291 PyErr_SetString(PyExc_OverflowError,
11292 "strings are too large to concat");
11293 goto error;
11294 }
11295 new_len = left_len + right_len;
11296
11297 if (unicode_modifiable(left)
11298 && PyUnicode_CheckExact(right)
11299 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011300 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11301 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011302 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011303 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011304 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11305 {
11306 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011307 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011308 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011309
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011310 /* copy 'right' into the newly allocated area of 'left' */
11311 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011312 }
Victor Stinner488fa492011-12-12 00:01:39 +010011313 else {
11314 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11315 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011316 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011317
Victor Stinner488fa492011-12-12 00:01:39 +010011318 /* Concat the two Unicode strings */
11319 res = PyUnicode_New(new_len, maxchar);
11320 if (res == NULL)
11321 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011322 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11323 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011324 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011325 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011326 }
11327 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011328 return;
11329
11330error:
Victor Stinner488fa492011-12-12 00:01:39 +010011331 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011332}
11333
11334void
11335PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11336{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011337 PyUnicode_Append(pleft, right);
11338 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011339}
11340
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011341/*
11342Wraps stringlib_parse_args_finds() and additionally ensures that the
11343first argument is a unicode object.
11344*/
11345
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070011346static inline int
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011347parse_args_finds_unicode(const char * function_name, PyObject *args,
11348 PyObject **substring,
11349 Py_ssize_t *start, Py_ssize_t *end)
11350{
11351 if(stringlib_parse_args_finds(function_name, args, substring,
11352 start, end)) {
11353 if (ensure_unicode(*substring) < 0)
11354 return 0;
11355 return 1;
11356 }
11357 return 0;
11358}
11359
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011360PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011361 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011362\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011363Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011364string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011365interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011366
11367static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011368unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011369{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011370 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011371 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011372 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011373 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011374 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011375 void *buf1, *buf2;
11376 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011377
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011378 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011379 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011380
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011381 kind1 = PyUnicode_KIND(self);
11382 kind2 = PyUnicode_KIND(substring);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011383 if (kind1 < kind2)
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011384 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011385
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011386 len1 = PyUnicode_GET_LENGTH(self);
11387 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011388 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011389 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011390 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011391
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011392 buf1 = PyUnicode_DATA(self);
11393 buf2 = PyUnicode_DATA(substring);
11394 if (kind2 != kind1) {
11395 buf2 = _PyUnicode_AsKind(substring, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011396 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011397 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011398 }
11399 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011400 case PyUnicode_1BYTE_KIND:
11401 iresult = ucs1lib_count(
11402 ((Py_UCS1*)buf1) + start, end - start,
11403 buf2, len2, PY_SSIZE_T_MAX
11404 );
11405 break;
11406 case PyUnicode_2BYTE_KIND:
11407 iresult = ucs2lib_count(
11408 ((Py_UCS2*)buf1) + start, end - start,
11409 buf2, len2, PY_SSIZE_T_MAX
11410 );
11411 break;
11412 case PyUnicode_4BYTE_KIND:
11413 iresult = ucs4lib_count(
11414 ((Py_UCS4*)buf1) + start, end - start,
11415 buf2, len2, PY_SSIZE_T_MAX
11416 );
11417 break;
11418 default:
11419 assert(0); iresult = 0;
11420 }
11421
11422 result = PyLong_FromSsize_t(iresult);
11423
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011424 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011425 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011426
Guido van Rossumd57fd912000-03-10 22:53:23 +000011427 return result;
11428}
11429
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011430PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000011431 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011432\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000011433Encode S using the codec registered for encoding. Default encoding\n\
11434is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000011435handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000011436a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
11437'xmlcharrefreplace' as well as any other name registered with\n\
11438codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011439
11440static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011441unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011442{
Benjamin Peterson308d6372009-09-18 21:42:35 +000011443 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011444 char *encoding = NULL;
11445 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000011446
Benjamin Peterson308d6372009-09-18 21:42:35 +000011447 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
11448 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011449 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011450 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011451}
11452
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011453PyDoc_STRVAR(expandtabs__doc__,
Ezio Melotti745d54d2013-11-16 19:10:57 +020011454 "S.expandtabs(tabsize=8) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011455\n\
11456Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011457If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011458
11459static PyObject*
Ezio Melotti745d54d2013-11-16 19:10:57 +020011460unicode_expandtabs(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011461{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011462 Py_ssize_t i, j, line_pos, src_len, incr;
11463 Py_UCS4 ch;
11464 PyObject *u;
11465 void *src_data, *dest_data;
Ezio Melotti745d54d2013-11-16 19:10:57 +020011466 static char *kwlist[] = {"tabsize", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011467 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011468 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011469 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011470
Ezio Melotti745d54d2013-11-16 19:10:57 +020011471 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:expandtabs",
11472 kwlist, &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000011473 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011474
Antoine Pitrou22425222011-10-04 19:10:51 +020011475 if (PyUnicode_READY(self) == -1)
11476 return NULL;
11477
Thomas Wouters7e474022000-07-16 12:04:32 +000011478 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011479 src_len = PyUnicode_GET_LENGTH(self);
11480 i = j = line_pos = 0;
11481 kind = PyUnicode_KIND(self);
11482 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011483 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011484 for (; i < src_len; i++) {
11485 ch = PyUnicode_READ(kind, src_data, i);
11486 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011487 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011488 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011489 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011490 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011491 goto overflow;
11492 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011493 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011494 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011495 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011496 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011497 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011498 goto overflow;
11499 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011500 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011501 if (ch == '\n' || ch == '\r')
11502 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011503 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011504 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011505 if (!found)
11506 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011507
Guido van Rossumd57fd912000-03-10 22:53:23 +000011508 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011509 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011510 if (!u)
11511 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011512 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011513
Antoine Pitroue71d5742011-10-04 15:55:09 +020011514 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011515
Antoine Pitroue71d5742011-10-04 15:55:09 +020011516 for (; i < src_len; i++) {
11517 ch = PyUnicode_READ(kind, src_data, i);
11518 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011519 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011520 incr = tabsize - (line_pos % tabsize);
11521 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010011522 FILL(kind, dest_data, ' ', j, incr);
11523 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011524 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011525 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011526 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011527 line_pos++;
11528 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011529 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011530 if (ch == '\n' || ch == '\r')
11531 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011532 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011533 }
11534 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011535 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011536
Antoine Pitroue71d5742011-10-04 15:55:09 +020011537 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011538 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11539 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011540}
11541
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011542PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011543 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011544\n\
11545Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011546such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011547arguments start and end are interpreted as in slice notation.\n\
11548\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011549Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011550
11551static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011552unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011553{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011554 /* initialize variables to prevent gcc warning */
11555 PyObject *substring = NULL;
11556 Py_ssize_t start = 0;
11557 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011558 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011559
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011560 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011561 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011562
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011563 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011564 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011565
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011566 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011567
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011568 if (result == -2)
11569 return NULL;
11570
Christian Heimes217cfd12007-12-02 14:31:20 +000011571 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011572}
11573
11574static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011575unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011576{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011577 void *data;
11578 enum PyUnicode_Kind kind;
11579 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011580
11581 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
11582 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011583 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011584 }
11585 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11586 PyErr_SetString(PyExc_IndexError, "string index out of range");
11587 return NULL;
11588 }
11589 kind = PyUnicode_KIND(self);
11590 data = PyUnicode_DATA(self);
11591 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011592 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011593}
11594
Guido van Rossumc2504932007-09-18 19:42:40 +000011595/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011596 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011597static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011598unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011599{
Guido van Rossumc2504932007-09-18 19:42:40 +000011600 Py_ssize_t len;
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011601 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011602
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011603#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011604 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011605#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011606 if (_PyUnicode_HASH(self) != -1)
11607 return _PyUnicode_HASH(self);
11608 if (PyUnicode_READY(self) == -1)
11609 return -1;
11610 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011611 /*
11612 We make the hash of the empty string be 0, rather than using
11613 (prefix ^ suffix), since this slightly obfuscates the hash secret
11614 */
11615 if (len == 0) {
11616 _PyUnicode_HASH(self) = 0;
11617 return 0;
11618 }
Christian Heimes985ecdc2013-11-20 11:46:18 +010011619 x = _Py_HashBytes(PyUnicode_DATA(self),
11620 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011621 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011622 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011623}
11624
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011625PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011626 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011627\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011628Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011629
11630static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011631unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011632{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011633 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011634 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011635 PyObject *substring = NULL;
11636 Py_ssize_t start = 0;
11637 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011638
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011639 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011640 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011641
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011642 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011643 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011644
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011645 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011646
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011647 if (result == -2)
11648 return NULL;
11649
Guido van Rossumd57fd912000-03-10 22:53:23 +000011650 if (result < 0) {
11651 PyErr_SetString(PyExc_ValueError, "substring not found");
11652 return NULL;
11653 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011654
Christian Heimes217cfd12007-12-02 14:31:20 +000011655 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011656}
11657
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011658PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011659 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011660\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011661Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011662at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011663
11664static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011665unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011666{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011667 Py_ssize_t i, length;
11668 int kind;
11669 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011670 int cased;
11671
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011672 if (PyUnicode_READY(self) == -1)
11673 return NULL;
11674 length = PyUnicode_GET_LENGTH(self);
11675 kind = PyUnicode_KIND(self);
11676 data = PyUnicode_DATA(self);
11677
Guido van Rossumd57fd912000-03-10 22:53:23 +000011678 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011679 if (length == 1)
11680 return PyBool_FromLong(
11681 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011682
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011683 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011684 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011685 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011686
Guido van Rossumd57fd912000-03-10 22:53:23 +000011687 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011688 for (i = 0; i < length; i++) {
11689 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011690
Benjamin Peterson29060642009-01-31 22:14:21 +000011691 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11692 return PyBool_FromLong(0);
11693 else if (!cased && Py_UNICODE_ISLOWER(ch))
11694 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011695 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011696 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011697}
11698
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011699PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011700 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011701\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011702Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011703at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011704
11705static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011706unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011707{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011708 Py_ssize_t i, length;
11709 int kind;
11710 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011711 int cased;
11712
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011713 if (PyUnicode_READY(self) == -1)
11714 return NULL;
11715 length = PyUnicode_GET_LENGTH(self);
11716 kind = PyUnicode_KIND(self);
11717 data = PyUnicode_DATA(self);
11718
Guido van Rossumd57fd912000-03-10 22:53:23 +000011719 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011720 if (length == 1)
11721 return PyBool_FromLong(
11722 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011723
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011724 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011725 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011726 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011727
Guido van Rossumd57fd912000-03-10 22:53:23 +000011728 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011729 for (i = 0; i < length; i++) {
11730 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011731
Benjamin Peterson29060642009-01-31 22:14:21 +000011732 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11733 return PyBool_FromLong(0);
11734 else if (!cased && Py_UNICODE_ISUPPER(ch))
11735 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011736 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011737 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011738}
11739
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011740PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011741 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011742\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011743Return True if S is a titlecased string and there is at least one\n\
11744character in S, i.e. upper- and titlecase characters may only\n\
11745follow uncased characters and lowercase characters only cased ones.\n\
11746Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011747
11748static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011749unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011750{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011751 Py_ssize_t i, length;
11752 int kind;
11753 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011754 int cased, previous_is_cased;
11755
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011756 if (PyUnicode_READY(self) == -1)
11757 return NULL;
11758 length = PyUnicode_GET_LENGTH(self);
11759 kind = PyUnicode_KIND(self);
11760 data = PyUnicode_DATA(self);
11761
Guido van Rossumd57fd912000-03-10 22:53:23 +000011762 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011763 if (length == 1) {
11764 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11765 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11766 (Py_UNICODE_ISUPPER(ch) != 0));
11767 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011768
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011769 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011770 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011771 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011772
Guido van Rossumd57fd912000-03-10 22:53:23 +000011773 cased = 0;
11774 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011775 for (i = 0; i < length; i++) {
11776 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011777
Benjamin Peterson29060642009-01-31 22:14:21 +000011778 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11779 if (previous_is_cased)
11780 return PyBool_FromLong(0);
11781 previous_is_cased = 1;
11782 cased = 1;
11783 }
11784 else if (Py_UNICODE_ISLOWER(ch)) {
11785 if (!previous_is_cased)
11786 return PyBool_FromLong(0);
11787 previous_is_cased = 1;
11788 cased = 1;
11789 }
11790 else
11791 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011792 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011793 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011794}
11795
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011796PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011797 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011798\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011799Return True if all characters in S are whitespace\n\
11800and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011801
11802static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011803unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011804{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011805 Py_ssize_t i, length;
11806 int kind;
11807 void *data;
11808
11809 if (PyUnicode_READY(self) == -1)
11810 return NULL;
11811 length = PyUnicode_GET_LENGTH(self);
11812 kind = PyUnicode_KIND(self);
11813 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011814
Guido van Rossumd57fd912000-03-10 22:53:23 +000011815 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011816 if (length == 1)
11817 return PyBool_FromLong(
11818 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011819
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011820 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011821 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011822 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011823
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011824 for (i = 0; i < length; i++) {
11825 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011826 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011827 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011828 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011829 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011830}
11831
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011832PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011833 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011834\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011835Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011836and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011837
11838static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011839unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011840{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011841 Py_ssize_t i, length;
11842 int kind;
11843 void *data;
11844
11845 if (PyUnicode_READY(self) == -1)
11846 return NULL;
11847 length = PyUnicode_GET_LENGTH(self);
11848 kind = PyUnicode_KIND(self);
11849 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011850
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011851 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011852 if (length == 1)
11853 return PyBool_FromLong(
11854 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011855
11856 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011857 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011858 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011859
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011860 for (i = 0; i < length; i++) {
11861 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011862 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011863 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011864 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011865}
11866
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011867PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011868 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011869\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011870Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011871and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011872
11873static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011874unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011875{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011876 int kind;
11877 void *data;
11878 Py_ssize_t len, i;
11879
11880 if (PyUnicode_READY(self) == -1)
11881 return NULL;
11882
11883 kind = PyUnicode_KIND(self);
11884 data = PyUnicode_DATA(self);
11885 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011886
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011887 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011888 if (len == 1) {
11889 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11890 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11891 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011892
11893 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011894 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011895 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011896
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011897 for (i = 0; i < len; i++) {
11898 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011899 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011900 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011901 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011902 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011903}
11904
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011905PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011906 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011907\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011908Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011909False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011910
11911static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011912unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011913{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011914 Py_ssize_t i, length;
11915 int kind;
11916 void *data;
11917
11918 if (PyUnicode_READY(self) == -1)
11919 return NULL;
11920 length = PyUnicode_GET_LENGTH(self);
11921 kind = PyUnicode_KIND(self);
11922 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011923
Guido van Rossumd57fd912000-03-10 22:53:23 +000011924 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011925 if (length == 1)
11926 return PyBool_FromLong(
11927 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011928
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011929 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011930 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011931 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011932
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011933 for (i = 0; i < length; i++) {
11934 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011935 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011936 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011937 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011938}
11939
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011940PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011941 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011942\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011943Return True if all characters in S are digits\n\
11944and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011945
11946static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011947unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011948{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011949 Py_ssize_t i, length;
11950 int kind;
11951 void *data;
11952
11953 if (PyUnicode_READY(self) == -1)
11954 return NULL;
11955 length = PyUnicode_GET_LENGTH(self);
11956 kind = PyUnicode_KIND(self);
11957 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011958
Guido van Rossumd57fd912000-03-10 22:53:23 +000011959 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011960 if (length == 1) {
11961 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11962 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11963 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011964
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011965 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011966 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011967 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011968
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011969 for (i = 0; i < length; i++) {
11970 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011971 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011972 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011973 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011974}
11975
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011976PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011977 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011978\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011979Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011980False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011981
11982static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011983unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011984{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011985 Py_ssize_t i, length;
11986 int kind;
11987 void *data;
11988
11989 if (PyUnicode_READY(self) == -1)
11990 return NULL;
11991 length = PyUnicode_GET_LENGTH(self);
11992 kind = PyUnicode_KIND(self);
11993 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011994
Guido van Rossumd57fd912000-03-10 22:53:23 +000011995 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011996 if (length == 1)
11997 return PyBool_FromLong(
11998 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011999
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012000 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012001 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012002 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012003
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012004 for (i = 0; i < length; i++) {
12005 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000012006 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012007 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000012008 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012009}
12010
Martin v. Löwis47383402007-08-15 07:32:56 +000012011int
12012PyUnicode_IsIdentifier(PyObject *self)
12013{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012014 int kind;
12015 void *data;
12016 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012017 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000012018
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012019 if (PyUnicode_READY(self) == -1) {
12020 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000012021 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012022 }
12023
12024 /* Special case for empty strings */
12025 if (PyUnicode_GET_LENGTH(self) == 0)
12026 return 0;
12027 kind = PyUnicode_KIND(self);
12028 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000012029
12030 /* PEP 3131 says that the first character must be in
12031 XID_Start and subsequent characters in XID_Continue,
12032 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000012033 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000012034 letters, digits, underscore). However, given the current
12035 definition of XID_Start and XID_Continue, it is sufficient
12036 to check just for these, except that _ must be allowed
12037 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012038 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050012039 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000012040 return 0;
12041
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040012042 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012043 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000012044 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000012045 return 1;
12046}
12047
12048PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012049 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000012050\n\
12051Return True if S is a valid identifier according\n\
Raymond Hettinger378170d2013-03-23 08:21:12 -070012052to the language definition.\n\
12053\n\
12054Use keyword.iskeyword() to test for reserved identifiers\n\
12055such as \"def\" and \"class\".\n");
Martin v. Löwis47383402007-08-15 07:32:56 +000012056
12057static PyObject*
12058unicode_isidentifier(PyObject *self)
12059{
12060 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12061}
12062
Georg Brandl559e5d72008-06-11 18:37:52 +000012063PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012064 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000012065\n\
12066Return True if all characters in S are considered\n\
12067printable in repr() or S is empty, False otherwise.");
12068
12069static PyObject*
12070unicode_isprintable(PyObject *self)
12071{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012072 Py_ssize_t i, length;
12073 int kind;
12074 void *data;
12075
12076 if (PyUnicode_READY(self) == -1)
12077 return NULL;
12078 length = PyUnicode_GET_LENGTH(self);
12079 kind = PyUnicode_KIND(self);
12080 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000012081
12082 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012083 if (length == 1)
12084 return PyBool_FromLong(
12085 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000012086
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012087 for (i = 0; i < length; i++) {
12088 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012089 Py_RETURN_FALSE;
12090 }
12091 }
12092 Py_RETURN_TRUE;
12093}
12094
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012095PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000012096 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012097\n\
12098Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000012099iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012100
12101static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012102unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012103{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012104 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012105}
12106
Martin v. Löwis18e16552006-02-15 17:27:45 +000012107static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012108unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012109{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012110 if (PyUnicode_READY(self) == -1)
12111 return -1;
12112 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012113}
12114
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012115PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012116 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012117\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012118Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012119done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012120
12121static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012122unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012123{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012124 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012125 Py_UCS4 fillchar = ' ';
12126
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012127 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012128 return NULL;
12129
Benjamin Petersonbac79492012-01-14 13:34:47 -050012130 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012131 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012132
Victor Stinnerc4b49542011-12-11 22:44:26 +010012133 if (PyUnicode_GET_LENGTH(self) >= width)
12134 return unicode_result_unchanged(self);
12135
12136 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012137}
12138
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012139PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012140 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012141\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012142Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012143
12144static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012145unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012146{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012147 if (PyUnicode_READY(self) == -1)
12148 return NULL;
12149 if (PyUnicode_IS_ASCII(self))
12150 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012151 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012152}
12153
/* Strip modes; they double as indexes into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above: PyArg_ParseTuple() format strings, one per
   strip mode. */
static const char * const stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for strip mode i: the format string with its three
   leading characters ("|O:") skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
12162
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012163/* externally visible for str.strip(unicode) */
12164PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012165_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012166{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012167 void *data;
12168 int kind;
12169 Py_ssize_t i, j, len;
12170 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012171 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012172
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012173 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12174 return NULL;
12175
12176 kind = PyUnicode_KIND(self);
12177 data = PyUnicode_DATA(self);
12178 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020012179 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012180 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12181 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020012182 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012183
Benjamin Peterson14339b62009-01-31 16:36:08 +000012184 i = 0;
12185 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012186 while (i < len) {
12187 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12188 if (!BLOOM(sepmask, ch))
12189 break;
12190 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12191 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012192 i++;
12193 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012194 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012195
Benjamin Peterson14339b62009-01-31 16:36:08 +000012196 j = len;
12197 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012198 j--;
12199 while (j >= i) {
12200 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12201 if (!BLOOM(sepmask, ch))
12202 break;
12203 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12204 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012205 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012206 }
12207
Benjamin Peterson29060642009-01-31 22:14:21 +000012208 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012209 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012210
Victor Stinner7931d9a2011-11-04 00:22:48 +010012211 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012212}
12213
12214PyObject*
12215PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12216{
12217 unsigned char *data;
12218 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012219 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012220
Victor Stinnerde636f32011-10-01 03:55:54 +020012221 if (PyUnicode_READY(self) == -1)
12222 return NULL;
12223
Victor Stinner684d5fd2012-05-03 02:32:34 +020012224 length = PyUnicode_GET_LENGTH(self);
12225 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012226
Victor Stinner684d5fd2012-05-03 02:32:34 +020012227 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012228 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012229
Victor Stinnerde636f32011-10-01 03:55:54 +020012230 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012231 PyErr_SetString(PyExc_IndexError, "string index out of range");
12232 return NULL;
12233 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012234 if (start >= length || end < start)
12235 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012236
Victor Stinner684d5fd2012-05-03 02:32:34 +020012237 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012238 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012239 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012240 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012241 }
12242 else {
12243 kind = PyUnicode_KIND(self);
12244 data = PyUnicode_1BYTE_DATA(self);
12245 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012246 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012247 length);
12248 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012249}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012250
12251static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012252do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012253{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012254 Py_ssize_t len, i, j;
12255
12256 if (PyUnicode_READY(self) == -1)
12257 return NULL;
12258
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012259 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012260
Victor Stinnercc7af722013-04-09 22:39:24 +020012261 if (PyUnicode_IS_ASCII(self)) {
12262 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12263
12264 i = 0;
12265 if (striptype != RIGHTSTRIP) {
12266 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012267 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012268 if (!_Py_ascii_whitespace[ch])
12269 break;
12270 i++;
12271 }
12272 }
12273
12274 j = len;
12275 if (striptype != LEFTSTRIP) {
12276 j--;
12277 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012278 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012279 if (!_Py_ascii_whitespace[ch])
12280 break;
12281 j--;
12282 }
12283 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012284 }
12285 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012286 else {
12287 int kind = PyUnicode_KIND(self);
12288 void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012289
Victor Stinnercc7af722013-04-09 22:39:24 +020012290 i = 0;
12291 if (striptype != RIGHTSTRIP) {
12292 while (i < len) {
12293 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12294 if (!Py_UNICODE_ISSPACE(ch))
12295 break;
12296 i++;
12297 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012298 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012299
12300 j = len;
12301 if (striptype != LEFTSTRIP) {
12302 j--;
12303 while (j >= i) {
12304 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12305 if (!Py_UNICODE_ISSPACE(ch))
12306 break;
12307 j--;
12308 }
12309 j++;
12310 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012311 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012312
Victor Stinner7931d9a2011-11-04 00:22:48 +010012313 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012314}
12315
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012316
12317static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012318do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012319{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012320 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012321
Serhiy Storchakac6792272013-10-19 21:03:34 +030012322 if (!PyArg_ParseTuple(args, stripformat[striptype], &sep))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012323 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012324
Benjamin Peterson14339b62009-01-31 16:36:08 +000012325 if (sep != NULL && sep != Py_None) {
12326 if (PyUnicode_Check(sep))
12327 return _PyUnicode_XStrip(self, striptype, sep);
12328 else {
12329 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012330 "%s arg must be None or str",
12331 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012332 return NULL;
12333 }
12334 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012335
Benjamin Peterson14339b62009-01-31 16:36:08 +000012336 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012337}
12338
12339
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012340PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012341 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012342\n\
12343Return a copy of the string S with leading and trailing\n\
12344whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012345If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012346
12347static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012348unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012349{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012350 if (PyTuple_GET_SIZE(args) == 0)
12351 return do_strip(self, BOTHSTRIP); /* Common case */
12352 else
12353 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012354}
12355
12356
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012357PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012358 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012359\n\
12360Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012361If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012362
12363static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012364unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012365{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012366 if (PyTuple_GET_SIZE(args) == 0)
12367 return do_strip(self, LEFTSTRIP); /* Common case */
12368 else
12369 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012370}
12371
12372
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012373PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012374 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012375\n\
12376Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012377If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012378
12379static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012380unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012381{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012382 if (PyTuple_GET_SIZE(args) == 0)
12383 return do_strip(self, RIGHTSTRIP); /* Common case */
12384 else
12385 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012386}
12387
12388
Guido van Rossumd57fd912000-03-10 22:53:23 +000012389static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012390unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012391{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012392 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012393 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012394
Serhiy Storchaka05997252013-01-26 12:14:02 +020012395 if (len < 1)
12396 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012397
Victor Stinnerc4b49542011-12-11 22:44:26 +010012398 /* no repeat, return original string */
12399 if (len == 1)
12400 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012401
Benjamin Petersonbac79492012-01-14 13:34:47 -050012402 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012403 return NULL;
12404
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012405 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012406 PyErr_SetString(PyExc_OverflowError,
12407 "repeated string is too long");
12408 return NULL;
12409 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012410 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012411
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012412 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012413 if (!u)
12414 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012415 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012416
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012417 if (PyUnicode_GET_LENGTH(str) == 1) {
12418 const int kind = PyUnicode_KIND(str);
12419 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012420 if (kind == PyUnicode_1BYTE_KIND) {
12421 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012422 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012423 }
12424 else if (kind == PyUnicode_2BYTE_KIND) {
12425 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012426 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012427 ucs2[n] = fill_char;
12428 } else {
12429 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12430 assert(kind == PyUnicode_4BYTE_KIND);
12431 for (n = 0; n < len; ++n)
12432 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012433 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012434 }
12435 else {
12436 /* number of characters copied this far */
12437 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012438 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012439 char *to = (char *) PyUnicode_DATA(u);
Christian Heimesf051e432016-09-13 20:22:02 +020012440 memcpy(to, PyUnicode_DATA(str),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012441 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012442 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012443 n = (done <= nchars-done) ? done : nchars-done;
Christian Heimesf051e432016-09-13 20:22:02 +020012444 memcpy(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012445 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012446 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012447 }
12448
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012449 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012450 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012451}
12452
Alexander Belopolsky40018472011-02-26 01:02:56 +000012453PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012454PyUnicode_Replace(PyObject *str,
12455 PyObject *substr,
12456 PyObject *replstr,
Alexander Belopolsky40018472011-02-26 01:02:56 +000012457 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012458{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012459 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12460 ensure_unicode(replstr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012461 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012462 return replace(str, substr, replstr, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012463}
12464
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012465PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000012466 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012467\n\
12468Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000012469old replaced by new. If the optional argument count is\n\
12470given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012471
12472static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012473unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012474{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012475 PyObject *str1;
12476 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012477 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012478
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012479 if (!PyArg_ParseTuple(args, "UU|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012480 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060012481 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012482 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012483 return replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012484}
12485
Alexander Belopolsky40018472011-02-26 01:02:56 +000012486static PyObject *
12487unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012488{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012489 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012490 Py_ssize_t isize;
12491 Py_ssize_t osize, squote, dquote, i, o;
12492 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012493 int ikind, okind, unchanged;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012494 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012495
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012496 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012497 return NULL;
12498
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012499 isize = PyUnicode_GET_LENGTH(unicode);
12500 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012501
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012502 /* Compute length of output, quote characters, and
12503 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012504 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012505 max = 127;
12506 squote = dquote = 0;
12507 ikind = PyUnicode_KIND(unicode);
12508 for (i = 0; i < isize; i++) {
12509 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040012510 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012511 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040012512 case '\'': squote++; break;
12513 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012514 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040012515 incr = 2;
12516 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012517 default:
12518 /* Fast-path ASCII */
12519 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012520 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012521 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012522 ;
12523 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012524 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012525 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012526 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012527 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012528 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012529 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040012530 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012531 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040012532 if (osize > PY_SSIZE_T_MAX - incr) {
12533 PyErr_SetString(PyExc_OverflowError,
12534 "string is too long to generate repr");
12535 return NULL;
12536 }
12537 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012538 }
12539
12540 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012541 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012542 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012543 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012544 if (dquote)
12545 /* Both squote and dquote present. Use squote,
12546 and escape them */
12547 osize += squote;
12548 else
12549 quote = '"';
12550 }
Victor Stinner55c08782013-04-14 18:45:39 +020012551 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012552
12553 repr = PyUnicode_New(osize, max);
12554 if (repr == NULL)
12555 return NULL;
12556 okind = PyUnicode_KIND(repr);
12557 odata = PyUnicode_DATA(repr);
12558
12559 PyUnicode_WRITE(okind, odata, 0, quote);
12560 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012561 if (unchanged) {
12562 _PyUnicode_FastCopyCharacters(repr, 1,
12563 unicode, 0,
12564 isize);
12565 }
12566 else {
12567 for (i = 0, o = 1; i < isize; i++) {
12568 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012569
Victor Stinner55c08782013-04-14 18:45:39 +020012570 /* Escape quotes and backslashes */
12571 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012572 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012573 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012574 continue;
12575 }
12576
12577 /* Map special whitespace to '\t', \n', '\r' */
12578 if (ch == '\t') {
12579 PyUnicode_WRITE(okind, odata, o++, '\\');
12580 PyUnicode_WRITE(okind, odata, o++, 't');
12581 }
12582 else if (ch == '\n') {
12583 PyUnicode_WRITE(okind, odata, o++, '\\');
12584 PyUnicode_WRITE(okind, odata, o++, 'n');
12585 }
12586 else if (ch == '\r') {
12587 PyUnicode_WRITE(okind, odata, o++, '\\');
12588 PyUnicode_WRITE(okind, odata, o++, 'r');
12589 }
12590
12591 /* Map non-printable US ASCII to '\xhh' */
12592 else if (ch < ' ' || ch == 0x7F) {
12593 PyUnicode_WRITE(okind, odata, o++, '\\');
12594 PyUnicode_WRITE(okind, odata, o++, 'x');
12595 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12596 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12597 }
12598
12599 /* Copy ASCII characters as-is */
12600 else if (ch < 0x7F) {
12601 PyUnicode_WRITE(okind, odata, o++, ch);
12602 }
12603
12604 /* Non-ASCII characters */
12605 else {
12606 /* Map Unicode whitespace and control characters
12607 (categories Z* and C* except ASCII space)
12608 */
12609 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12610 PyUnicode_WRITE(okind, odata, o++, '\\');
12611 /* Map 8-bit characters to '\xhh' */
12612 if (ch <= 0xff) {
12613 PyUnicode_WRITE(okind, odata, o++, 'x');
12614 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12615 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12616 }
12617 /* Map 16-bit characters to '\uxxxx' */
12618 else if (ch <= 0xffff) {
12619 PyUnicode_WRITE(okind, odata, o++, 'u');
12620 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12621 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12622 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12623 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12624 }
12625 /* Map 21-bit characters to '\U00xxxxxx' */
12626 else {
12627 PyUnicode_WRITE(okind, odata, o++, 'U');
12628 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12629 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12630 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12631 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12632 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12633 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12634 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12635 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12636 }
12637 }
12638 /* Copy characters as-is */
12639 else {
12640 PyUnicode_WRITE(okind, odata, o++, ch);
12641 }
Georg Brandl559e5d72008-06-11 18:37:52 +000012642 }
12643 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012644 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012645 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012646 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012647 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012648}
12649
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012650PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012651 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012652\n\
12653Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012654such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012655arguments start and end are interpreted as in slice notation.\n\
12656\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012657Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012658
12659static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012660unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012661{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012662 /* initialize variables to prevent gcc warning */
12663 PyObject *substring = NULL;
12664 Py_ssize_t start = 0;
12665 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012666 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012667
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012668 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012669 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012670
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012671 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012672 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012673
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012674 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012675
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012676 if (result == -2)
12677 return NULL;
12678
Christian Heimes217cfd12007-12-02 14:31:20 +000012679 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012680}
12681
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012682PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012683 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012684\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012685Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012686
12687static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012688unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012689{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012690 /* initialize variables to prevent gcc warning */
12691 PyObject *substring = NULL;
12692 Py_ssize_t start = 0;
12693 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012694 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012695
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012696 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012697 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012698
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012699 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012700 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012701
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012702 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012703
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012704 if (result == -2)
12705 return NULL;
12706
Guido van Rossumd57fd912000-03-10 22:53:23 +000012707 if (result < 0) {
12708 PyErr_SetString(PyExc_ValueError, "substring not found");
12709 return NULL;
12710 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012711
Christian Heimes217cfd12007-12-02 14:31:20 +000012712 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012713}
12714
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012715PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012716 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012717\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012718Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012719done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012720
12721static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012722unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012723{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012724 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012725 Py_UCS4 fillchar = ' ';
12726
Victor Stinnere9a29352011-10-01 02:14:59 +020012727 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012728 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012729
Benjamin Petersonbac79492012-01-14 13:34:47 -050012730 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012731 return NULL;
12732
Victor Stinnerc4b49542011-12-11 22:44:26 +010012733 if (PyUnicode_GET_LENGTH(self) >= width)
12734 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012735
Victor Stinnerc4b49542011-12-11 22:44:26 +010012736 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012737}
12738
Alexander Belopolsky40018472011-02-26 01:02:56 +000012739PyObject *
12740PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012741{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012742 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012743 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012744
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012745 return split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012746}
12747
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012748PyDoc_STRVAR(split__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012749 "S.split(sep=None, maxsplit=-1) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012750\n\
12751Return a list of the words in S, using sep as the\n\
12752delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012753splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012754whitespace string is a separator and empty strings are\n\
12755removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012756
12757static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012758unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012759{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012760 static char *kwlist[] = {"sep", "maxsplit", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000012761 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012762 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012763
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012764 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12765 kwlist, &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012766 return NULL;
12767
12768 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012769 return split(self, NULL, maxcount);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012770
12771 if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012772 return split(self, substring, maxcount);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012773
12774 PyErr_Format(PyExc_TypeError,
12775 "must be str or None, not %.100s",
12776 Py_TYPE(substring)->tp_name);
12777 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012778}
12779
Thomas Wouters477c8d52006-05-27 19:21:47 +000012780PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012781PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012782{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012783 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012784 int kind1, kind2;
12785 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012786 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012787
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012788 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012789 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012790
Victor Stinner14f8f022011-10-05 20:58:25 +020012791 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012792 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012793 len1 = PyUnicode_GET_LENGTH(str_obj);
12794 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012795 if (kind1 < kind2 || len1 < len2) {
12796 _Py_INCREF_UNICODE_EMPTY();
12797 if (!unicode_empty)
12798 out = NULL;
12799 else {
12800 out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
12801 Py_DECREF(unicode_empty);
12802 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012803 return out;
12804 }
12805 buf1 = PyUnicode_DATA(str_obj);
12806 buf2 = PyUnicode_DATA(sep_obj);
12807 if (kind2 != kind1) {
12808 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12809 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012810 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012811 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012812
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012813 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012814 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012815 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12816 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12817 else
12818 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012819 break;
12820 case PyUnicode_2BYTE_KIND:
12821 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12822 break;
12823 case PyUnicode_4BYTE_KIND:
12824 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12825 break;
12826 default:
12827 assert(0);
12828 out = 0;
12829 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012830
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012831 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012832 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012833
12834 return out;
12835}
12836
12837
12838PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012839PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012840{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012841 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012842 int kind1, kind2;
12843 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012844 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012845
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012846 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012847 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012848
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012849 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012850 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012851 len1 = PyUnicode_GET_LENGTH(str_obj);
12852 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012853 if (kind1 < kind2 || len1 < len2) {
12854 _Py_INCREF_UNICODE_EMPTY();
12855 if (!unicode_empty)
12856 out = NULL;
12857 else {
12858 out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
12859 Py_DECREF(unicode_empty);
12860 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012861 return out;
12862 }
12863 buf1 = PyUnicode_DATA(str_obj);
12864 buf2 = PyUnicode_DATA(sep_obj);
12865 if (kind2 != kind1) {
12866 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12867 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012868 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012869 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012870
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012871 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012872 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012873 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12874 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12875 else
12876 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012877 break;
12878 case PyUnicode_2BYTE_KIND:
12879 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12880 break;
12881 case PyUnicode_4BYTE_KIND:
12882 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12883 break;
12884 default:
12885 assert(0);
12886 out = 0;
12887 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012888
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012889 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012890 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012891
12892 return out;
12893}
12894
12895PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012896 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012897\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012898Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012899the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012900found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012901
12902static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012903unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012904{
Victor Stinner9310abb2011-10-05 00:59:23 +020012905 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012906}
12907
12908PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012909 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012910\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012911Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012912the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012913separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012914
12915static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012916unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012917{
Victor Stinner9310abb2011-10-05 00:59:23 +020012918 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012919}
12920
Alexander Belopolsky40018472011-02-26 01:02:56 +000012921PyObject *
12922PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012923{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012924 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012925 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012926
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012927 return rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012928}
12929
12930PyDoc_STRVAR(rsplit__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012931 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012932\n\
12933Return a list of the words in S, using sep as the\n\
12934delimiter string, starting at the end of the string and\n\
12935working to the front. If maxsplit is given, at most maxsplit\n\
12936splits are done. If sep is not specified, any whitespace string\n\
12937is a separator.");
12938
12939static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012940unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012941{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012942 static char *kwlist[] = {"sep", "maxsplit", 0};
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012943 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012944 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012945
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012946 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
12947 kwlist, &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012948 return NULL;
12949
12950 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012951 return rsplit(self, NULL, maxcount);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012952
12953 if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012954 return rsplit(self, substring, maxcount);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012955
12956 PyErr_Format(PyExc_TypeError,
12957 "must be str or None, not %.100s",
12958 Py_TYPE(substring)->tp_name);
12959 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012960}
12961
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012962PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012963 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012964\n\
12965Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012966Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012967is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012968
12969static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012970unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012971{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012972 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012973 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012974
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012975 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12976 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012977 return NULL;
12978
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012979 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012980}
12981
12982static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012983PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012984{
Victor Stinnerc4b49542011-12-11 22:44:26 +010012985 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012986}
12987
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012988PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012989 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012990\n\
12991Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012992and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012993
12994static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012995unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012996{
Benjamin Petersoneea48462012-01-16 14:28:50 -050012997 if (PyUnicode_READY(self) == -1)
12998 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012999 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013000}
13001
Larry Hastings61272b72014-01-07 12:41:53 -080013002/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000013003
Larry Hastings31826802013-10-19 00:09:25 -070013004@staticmethod
13005str.maketrans as unicode_maketrans
13006
13007 x: object
13008
13009 y: unicode=NULL
13010
13011 z: unicode=NULL
13012
13013 /
13014
13015Return a translation table usable for str.translate().
13016
13017If there is only one argument, it must be a dictionary mapping Unicode
13018ordinals (integers) or characters to Unicode ordinals, strings or None.
13019Character keys will be then converted to ordinals.
13020If there are two arguments, they must be strings of equal length, and
13021in the resulting dictionary, each character in x will be mapped to the
13022character at the same position in y. If there is a third argument, it
13023must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080013024[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070013025
Larry Hastings31826802013-10-19 00:09:25 -070013026static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080013027unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030013028/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070013029{
Georg Brandlceee0772007-11-27 23:48:05 +000013030 PyObject *new = NULL, *key, *value;
13031 Py_ssize_t i = 0;
13032 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013033
Georg Brandlceee0772007-11-27 23:48:05 +000013034 new = PyDict_New();
13035 if (!new)
13036 return NULL;
13037 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013038 int x_kind, y_kind, z_kind;
13039 void *x_data, *y_data, *z_data;
13040
Georg Brandlceee0772007-11-27 23:48:05 +000013041 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000013042 if (!PyUnicode_Check(x)) {
13043 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13044 "be a string if there is a second argument");
13045 goto err;
13046 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013047 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013048 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13049 "arguments must have equal length");
13050 goto err;
13051 }
13052 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013053 x_kind = PyUnicode_KIND(x);
13054 y_kind = PyUnicode_KIND(y);
13055 x_data = PyUnicode_DATA(x);
13056 y_data = PyUnicode_DATA(y);
13057 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13058 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013059 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000013060 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060013061 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013062 if (!value) {
13063 Py_DECREF(key);
13064 goto err;
13065 }
Georg Brandlceee0772007-11-27 23:48:05 +000013066 res = PyDict_SetItem(new, key, value);
13067 Py_DECREF(key);
13068 Py_DECREF(value);
13069 if (res < 0)
13070 goto err;
13071 }
13072 /* create entries for deleting chars in z */
13073 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013074 z_kind = PyUnicode_KIND(z);
13075 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013076 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013077 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013078 if (!key)
13079 goto err;
13080 res = PyDict_SetItem(new, key, Py_None);
13081 Py_DECREF(key);
13082 if (res < 0)
13083 goto err;
13084 }
13085 }
13086 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013087 int kind;
13088 void *data;
13089
Georg Brandlceee0772007-11-27 23:48:05 +000013090 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013091 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013092 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13093 "to maketrans it must be a dict");
13094 goto err;
13095 }
13096 /* copy entries into the new dict, converting string keys to int keys */
13097 while (PyDict_Next(x, &i, &key, &value)) {
13098 if (PyUnicode_Check(key)) {
13099 /* convert string keys to integer keys */
13100 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013101 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013102 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13103 "table must be of length 1");
13104 goto err;
13105 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013106 kind = PyUnicode_KIND(key);
13107 data = PyUnicode_DATA(key);
13108 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013109 if (!newkey)
13110 goto err;
13111 res = PyDict_SetItem(new, newkey, value);
13112 Py_DECREF(newkey);
13113 if (res < 0)
13114 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013115 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013116 /* just keep integer keys */
13117 if (PyDict_SetItem(new, key, value) < 0)
13118 goto err;
13119 } else {
13120 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13121 "be strings or integers");
13122 goto err;
13123 }
13124 }
13125 }
13126 return new;
13127 err:
13128 Py_DECREF(new);
13129 return NULL;
13130}
13131
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013132PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013133 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013134\n\
Zachary Ware79b98df2015-08-05 23:54:15 -050013135Return a copy of the string S in which each character has been mapped\n\
13136through the given translation table. The table must implement\n\
13137lookup/indexing via __getitem__, for instance a dictionary or list,\n\
13138mapping Unicode ordinals to Unicode ordinals, strings, or None. If\n\
13139this operation raises LookupError, the character is left untouched.\n\
13140Characters mapped to None are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013141
13142static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013143unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013144{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013145 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013146}
13147
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013148PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013149 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013150\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013151Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013152
13153static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020013154unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013155{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013156 if (PyUnicode_READY(self) == -1)
13157 return NULL;
13158 if (PyUnicode_IS_ASCII(self))
13159 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013160 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013161}
13162
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013163PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013164 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013165\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000013166Pad a numeric string S with zeros on the left, to fill a field\n\
13167of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013168
13169static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020013170unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013171{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013172 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013173 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013174 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013175 int kind;
13176 void *data;
13177 Py_UCS4 chr;
13178
Martin v. Löwis18e16552006-02-15 17:27:45 +000013179 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000013180 return NULL;
13181
Benjamin Petersonbac79492012-01-14 13:34:47 -050013182 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013183 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013184
Victor Stinnerc4b49542011-12-11 22:44:26 +010013185 if (PyUnicode_GET_LENGTH(self) >= width)
13186 return unicode_result_unchanged(self);
13187
13188 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013189
13190 u = pad(self, fill, 0, '0');
13191
Walter Dörwald068325e2002-04-15 13:36:47 +000013192 if (u == NULL)
13193 return NULL;
13194
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013195 kind = PyUnicode_KIND(u);
13196 data = PyUnicode_DATA(u);
13197 chr = PyUnicode_READ(kind, data, fill);
13198
13199 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013200 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013201 PyUnicode_WRITE(kind, data, 0, chr);
13202 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013203 }
13204
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013205 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013206 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013207}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013208
#if 0
/* Compiled out.  Thin wrapper around
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);
    return converted;
}
#endif
13216
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013217PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013218 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013219\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013220Return True if S starts with the specified prefix, False otherwise.\n\
13221With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013222With optional end, stop comparing S at that position.\n\
13223prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013224
13225static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013226unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013227 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013228{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013229 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013230 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013231 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013232 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013233 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013234
Jesus Ceaac451502011-04-20 17:09:23 +020013235 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013236 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013237 if (PyTuple_Check(subobj)) {
13238 Py_ssize_t i;
13239 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013240 substring = PyTuple_GET_ITEM(subobj, i);
13241 if (!PyUnicode_Check(substring)) {
13242 PyErr_Format(PyExc_TypeError,
13243 "tuple for startswith must only contain str, "
13244 "not %.100s",
13245 Py_TYPE(substring)->tp_name);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013246 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013247 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013248 result = tailmatch(self, substring, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013249 if (result == -1)
13250 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013251 if (result) {
13252 Py_RETURN_TRUE;
13253 }
13254 }
13255 /* nothing matched */
13256 Py_RETURN_FALSE;
13257 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013258 if (!PyUnicode_Check(subobj)) {
13259 PyErr_Format(PyExc_TypeError,
13260 "startswith first arg must be str or "
13261 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013262 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013263 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013264 result = tailmatch(self, subobj, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013265 if (result == -1)
13266 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013267 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013268}
13269
13270
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013271PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013272 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013273\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013274Return True if S ends with the specified suffix, False otherwise.\n\
13275With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013276With optional end, stop comparing S at that position.\n\
13277suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013278
13279static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013280unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013281 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013282{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013283 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013284 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013285 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013286 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013287 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013288
Jesus Ceaac451502011-04-20 17:09:23 +020013289 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013290 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013291 if (PyTuple_Check(subobj)) {
13292 Py_ssize_t i;
13293 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013294 substring = PyTuple_GET_ITEM(subobj, i);
13295 if (!PyUnicode_Check(substring)) {
13296 PyErr_Format(PyExc_TypeError,
13297 "tuple for endswith must only contain str, "
13298 "not %.100s",
13299 Py_TYPE(substring)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013300 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013301 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013302 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013303 if (result == -1)
13304 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013305 if (result) {
13306 Py_RETURN_TRUE;
13307 }
13308 }
13309 Py_RETURN_FALSE;
13310 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013311 if (!PyUnicode_Check(subobj)) {
13312 PyErr_Format(PyExc_TypeError,
13313 "endswith first arg must be str or "
13314 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013315 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013316 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013317 result = tailmatch(self, subobj, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013318 if (result == -1)
13319 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013320 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013321}
13322
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013323static inline void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013324_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013325{
Victor Stinnereb36fda2015-10-03 01:55:51 +020013326 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13327 writer->data = PyUnicode_DATA(writer->buffer);
13328
13329 if (!writer->readonly) {
13330 writer->kind = PyUnicode_KIND(writer->buffer);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013331 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinnereb36fda2015-10-03 01:55:51 +020013332 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013333 else {
Victor Stinnereb36fda2015-10-03 01:55:51 +020013334 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13335 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13336 writer->kind = PyUnicode_WCHAR_KIND;
13337 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13338
Victor Stinner8f674cc2013-04-17 23:02:17 +020013339 /* Copy-on-write mode: set buffer size to 0 so
13340 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13341 * next write. */
13342 writer->size = 0;
13343 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013344}
13345
Victor Stinnerd3f08822012-05-29 12:57:52 +020013346void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013347_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013348{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013349 memset(writer, 0, sizeof(*writer));
Victor Stinnereb36fda2015-10-03 01:55:51 +020013350
13351 /* ASCII is the bare minimum */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013352 writer->min_char = 127;
Victor Stinnereb36fda2015-10-03 01:55:51 +020013353
13354 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13355 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13356 writer->kind = PyUnicode_WCHAR_KIND;
13357 assert(writer->kind <= PyUnicode_1BYTE_KIND);
Victor Stinner202fdca2012-05-07 12:47:02 +020013358}
13359
Victor Stinnerd3f08822012-05-29 12:57:52 +020013360int
13361_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13362 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013363{
13364 Py_ssize_t newlen;
13365 PyObject *newbuffer;
13366
Victor Stinner2740e462016-09-06 16:58:36 -070013367 assert(maxchar <= MAX_UNICODE);
13368
Victor Stinnerca9381e2015-09-22 00:58:32 +020013369 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
Victor Stinner61744742015-09-22 01:01:17 +020013370 assert((maxchar > writer->maxchar && length >= 0)
13371 || length > 0);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013372
Victor Stinner202fdca2012-05-07 12:47:02 +020013373 if (length > PY_SSIZE_T_MAX - writer->pos) {
13374 PyErr_NoMemory();
13375 return -1;
13376 }
13377 newlen = writer->pos + length;
13378
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013379 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013380
Victor Stinnerd3f08822012-05-29 12:57:52 +020013381 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013382 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013383 if (writer->overallocate
13384 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13385 /* overallocate to limit the number of realloc() */
13386 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013387 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013388 if (newlen < writer->min_length)
13389 newlen = writer->min_length;
13390
Victor Stinnerd3f08822012-05-29 12:57:52 +020013391 writer->buffer = PyUnicode_New(newlen, maxchar);
13392 if (writer->buffer == NULL)
13393 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013394 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013395 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013396 if (writer->overallocate
13397 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13398 /* overallocate to limit the number of realloc() */
13399 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013400 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013401 if (newlen < writer->min_length)
13402 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013403
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013404 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013405 /* resize + widen */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +030013406 maxchar = Py_MAX(maxchar, writer->maxchar);
Victor Stinner202fdca2012-05-07 12:47:02 +020013407 newbuffer = PyUnicode_New(newlen, maxchar);
13408 if (newbuffer == NULL)
13409 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013410 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13411 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013412 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013413 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013414 }
13415 else {
13416 newbuffer = resize_compact(writer->buffer, newlen);
13417 if (newbuffer == NULL)
13418 return -1;
13419 }
13420 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013421 }
13422 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013423 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013424 newbuffer = PyUnicode_New(writer->size, maxchar);
13425 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013426 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013427 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13428 writer->buffer, 0, writer->pos);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030013429 Py_SETREF(writer->buffer, newbuffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013430 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013431 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013432 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010013433
13434#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020013435}
13436
Victor Stinnerca9381e2015-09-22 00:58:32 +020013437int
13438_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13439 enum PyUnicode_Kind kind)
13440{
13441 Py_UCS4 maxchar;
13442
13443 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13444 assert(writer->kind < kind);
13445
13446 switch (kind)
13447 {
13448 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13449 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13450 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
13451 default:
13452 assert(0 && "invalid kind");
13453 return -1;
13454 }
13455
13456 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13457}
13458
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013459static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013460_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013461{
Victor Stinner2740e462016-09-06 16:58:36 -070013462 assert(ch <= MAX_UNICODE);
Victor Stinnera0dd0212013-04-11 22:09:04 +020013463 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13464 return -1;
13465 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13466 writer->pos++;
13467 return 0;
13468}
13469
13470int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013471_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13472{
13473 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13474}
13475
13476int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013477_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13478{
13479 Py_UCS4 maxchar;
13480 Py_ssize_t len;
13481
13482 if (PyUnicode_READY(str) == -1)
13483 return -1;
13484 len = PyUnicode_GET_LENGTH(str);
13485 if (len == 0)
13486 return 0;
13487 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13488 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013489 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010013490 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020013491 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013492 Py_INCREF(str);
13493 writer->buffer = str;
13494 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013495 writer->pos += len;
13496 return 0;
13497 }
13498 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13499 return -1;
13500 }
13501 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13502 str, 0, len);
13503 writer->pos += len;
13504 return 0;
13505}
13506
Victor Stinnere215d962012-10-06 23:03:36 +020013507int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013508_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13509 Py_ssize_t start, Py_ssize_t end)
13510{
13511 Py_UCS4 maxchar;
13512 Py_ssize_t len;
13513
13514 if (PyUnicode_READY(str) == -1)
13515 return -1;
13516
13517 assert(0 <= start);
13518 assert(end <= PyUnicode_GET_LENGTH(str));
13519 assert(start <= end);
13520
13521 if (end == 0)
13522 return 0;
13523
13524 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13525 return _PyUnicodeWriter_WriteStr(writer, str);
13526
13527 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13528 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13529 else
13530 maxchar = writer->maxchar;
13531 len = end - start;
13532
13533 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13534 return -1;
13535
13536 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13537 str, start, len);
13538 writer->pos += len;
13539 return 0;
13540}
13541
13542int
Victor Stinner4a587072013-11-19 12:54:53 +010013543_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13544 const char *ascii, Py_ssize_t len)
13545{
13546 if (len == -1)
13547 len = strlen(ascii);
13548
13549 assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13550
13551 if (writer->buffer == NULL && !writer->overallocate) {
13552 PyObject *str;
13553
13554 str = _PyUnicode_FromASCII(ascii, len);
13555 if (str == NULL)
13556 return -1;
13557
13558 writer->readonly = 1;
13559 writer->buffer = str;
13560 _PyUnicodeWriter_Update(writer);
13561 writer->pos += len;
13562 return 0;
13563 }
13564
13565 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13566 return -1;
13567
13568 switch (writer->kind)
13569 {
13570 case PyUnicode_1BYTE_KIND:
13571 {
13572 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13573 Py_UCS1 *data = writer->data;
13574
Christian Heimesf051e432016-09-13 20:22:02 +020013575 memcpy(data + writer->pos, str, len);
Victor Stinner4a587072013-11-19 12:54:53 +010013576 break;
13577 }
13578 case PyUnicode_2BYTE_KIND:
13579 {
13580 _PyUnicode_CONVERT_BYTES(
13581 Py_UCS1, Py_UCS2,
13582 ascii, ascii + len,
13583 (Py_UCS2 *)writer->data + writer->pos);
13584 break;
13585 }
13586 case PyUnicode_4BYTE_KIND:
13587 {
13588 _PyUnicode_CONVERT_BYTES(
13589 Py_UCS1, Py_UCS4,
13590 ascii, ascii + len,
13591 (Py_UCS4 *)writer->data + writer->pos);
13592 break;
13593 }
13594 default:
13595 assert(0);
13596 }
13597
13598 writer->pos += len;
13599 return 0;
13600}
13601
13602int
13603_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13604 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020013605{
13606 Py_UCS4 maxchar;
13607
13608 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13609 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13610 return -1;
13611 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13612 writer->pos += len;
13613 return 0;
13614}
13615
Victor Stinnerd3f08822012-05-29 12:57:52 +020013616PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013617_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013618{
Victor Stinner15a0bd32013-07-08 22:29:55 +020013619 PyObject *str;
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013620
Victor Stinnerd3f08822012-05-29 12:57:52 +020013621 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013622 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013623 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013624 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013625
13626 str = writer->buffer;
13627 writer->buffer = NULL;
13628
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013629 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013630 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13631 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013632 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013633
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013634 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
13635 PyObject *str2;
13636 str2 = resize_compact(str, writer->pos);
13637 if (str2 == NULL) {
13638 Py_DECREF(str);
13639 return NULL;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013640 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013641 str = str2;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013642 }
13643
Victor Stinner15a0bd32013-07-08 22:29:55 +020013644 assert(_PyUnicode_CheckConsistency(str, 1));
13645 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020013646}
13647
Victor Stinnerd3f08822012-05-29 12:57:52 +020013648void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013649_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013650{
13651 Py_CLEAR(writer->buffer);
13652}
13653
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013654#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013655
13656PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013657 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013658\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013659Return a formatted version of S, using substitutions from args and kwargs.\n\
13660The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013661
Eric Smith27bbca62010-11-04 17:06:58 +000013662PyDoc_STRVAR(format_map__doc__,
13663 "S.format_map(mapping) -> str\n\
13664\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013665Return a formatted version of S, using substitutions from mapping.\n\
13666The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013667
Eric Smith4a7d76d2008-05-30 18:10:19 +000013668static PyObject *
13669unicode__format__(PyObject* self, PyObject* args)
13670{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013671 PyObject *format_spec;
13672 _PyUnicodeWriter writer;
13673 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013674
13675 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
13676 return NULL;
13677
Victor Stinnerd3f08822012-05-29 12:57:52 +020013678 if (PyUnicode_READY(self) == -1)
13679 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020013680 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013681 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13682 self, format_spec, 0,
13683 PyUnicode_GET_LENGTH(format_spec));
13684 if (ret == -1) {
13685 _PyUnicodeWriter_Dealloc(&writer);
13686 return NULL;
13687 }
13688 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013689}
13690
Eric Smith8c663262007-08-25 02:26:07 +000013691PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013692 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013693\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013694Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000013695
13696static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013697unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013698{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013699 Py_ssize_t size;
13700
13701 /* If it's a compact object, account for base structure +
13702 character data. */
13703 if (PyUnicode_IS_COMPACT_ASCII(v))
13704 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
13705 else if (PyUnicode_IS_COMPACT(v))
13706 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013707 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013708 else {
13709 /* If it is a two-block object, account for base object, and
13710 for character block if present. */
13711 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020013712 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013713 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013714 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013715 }
13716 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013717 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020013718 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013719 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020013720 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020013721 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013722
13723 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013724}
13725
13726PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013727 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013728
13729static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013730unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013731{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013732 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013733 if (!copy)
13734 return NULL;
13735 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013736}
13737
Guido van Rossumd57fd912000-03-10 22:53:23 +000013738static PyMethodDef unicode_methods[] = {
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000013739 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013740 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
Ezio Melotticda6b6d2012-02-26 09:39:55 +020013741 {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
13742 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013743 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
13744 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
Benjamin Petersond5890c82012-01-14 13:23:30 -050013745 {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013746 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
13747 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
13748 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
Ezio Melotti745d54d2013-11-16 19:10:57 +020013749 {"expandtabs", (PyCFunction) unicode_expandtabs,
13750 METH_VARARGS | METH_KEYWORDS, expandtabs__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013751 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013752 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013753 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
13754 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
13755 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013756 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013757 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13758 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
13759 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013760 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013761 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Ezio Melotti745d54d2013-11-16 19:10:57 +020013762 {"splitlines", (PyCFunction) unicode_splitlines,
13763 METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013764 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013765 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
13766 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
13767 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
13768 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13769 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
13770 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
13771 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
13772 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
13773 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
13774 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
13775 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
13776 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
13777 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
13778 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000013779 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000013780 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013781 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000013782 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013783 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000013784 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Larry Hastings31826802013-10-19 00:09:25 -070013785 UNICODE_MAKETRANS_METHODDEF
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013786 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000013787#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013788 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013789 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013790#endif
13791
Benjamin Peterson14339b62009-01-31 16:36:08 +000013792 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013793 {NULL, NULL}
13794};
13795
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013796static PyObject *
13797unicode_mod(PyObject *v, PyObject *w)
13798{
Brian Curtindfc80e32011-08-10 20:28:54 -050013799 if (!PyUnicode_Check(v))
13800 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013801 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013802}
13803
13804static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013805 0, /*nb_add*/
13806 0, /*nb_subtract*/
13807 0, /*nb_multiply*/
13808 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013809};
13810
Guido van Rossumd57fd912000-03-10 22:53:23 +000013811static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013812 (lenfunc) unicode_length, /* sq_length */
13813 PyUnicode_Concat, /* sq_concat */
13814 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13815 (ssizeargfunc) unicode_getitem, /* sq_item */
13816 0, /* sq_slice */
13817 0, /* sq_ass_item */
13818 0, /* sq_ass_slice */
13819 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013820};
13821
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013822static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013823unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013824{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013825 if (PyUnicode_READY(self) == -1)
13826 return NULL;
13827
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013828 if (PyIndex_Check(item)) {
13829 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013830 if (i == -1 && PyErr_Occurred())
13831 return NULL;
13832 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013833 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013834 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013835 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013836 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013837 PyObject *result;
13838 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013839 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013840 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013841
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013842 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013843 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013844 return NULL;
13845 }
13846
13847 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020013848 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013849 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013850 slicelength == PyUnicode_GET_LENGTH(self)) {
13851 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013852 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013853 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013854 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013855 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013856 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013857 src_kind = PyUnicode_KIND(self);
13858 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013859 if (!PyUnicode_IS_ASCII(self)) {
13860 kind_limit = kind_maxchar_limit(src_kind);
13861 max_char = 0;
13862 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13863 ch = PyUnicode_READ(src_kind, src_data, cur);
13864 if (ch > max_char) {
13865 max_char = ch;
13866 if (max_char >= kind_limit)
13867 break;
13868 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013869 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013870 }
Victor Stinner55c99112011-10-13 01:17:06 +020013871 else
13872 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013873 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013874 if (result == NULL)
13875 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013876 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013877 dest_data = PyUnicode_DATA(result);
13878
13879 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013880 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13881 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013882 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013883 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013884 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013885 } else {
13886 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13887 return NULL;
13888 }
13889}
13890
13891static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013892 (lenfunc)unicode_length, /* mp_length */
13893 (binaryfunc)unicode_subscript, /* mp_subscript */
13894 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013895};
13896
Guido van Rossumd57fd912000-03-10 22:53:23 +000013897
Guido van Rossumd57fd912000-03-10 22:53:23 +000013898/* Helpers for PyUnicode_Format() */
13899
Victor Stinnera47082312012-10-04 02:19:54 +020013900struct unicode_formatter_t {
13901 PyObject *args;
13902 int args_owned;
13903 Py_ssize_t arglen, argidx;
13904 PyObject *dict;
13905
13906 enum PyUnicode_Kind fmtkind;
13907 Py_ssize_t fmtcnt, fmtpos;
13908 void *fmtdata;
13909 PyObject *fmtstr;
13910
13911 _PyUnicodeWriter writer;
13912};
13913
13914struct unicode_format_arg_t {
13915 Py_UCS4 ch;
13916 int flags;
13917 Py_ssize_t width;
13918 int prec;
13919 int sign;
13920};
13921
Guido van Rossumd57fd912000-03-10 22:53:23 +000013922static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020013923unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013924{
Victor Stinnera47082312012-10-04 02:19:54 +020013925 Py_ssize_t argidx = ctx->argidx;
13926
13927 if (argidx < ctx->arglen) {
13928 ctx->argidx++;
13929 if (ctx->arglen < 0)
13930 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000013931 else
Victor Stinnera47082312012-10-04 02:19:54 +020013932 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013933 }
13934 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013935 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013936 return NULL;
13937}
13938
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013939/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013940
Victor Stinnera47082312012-10-04 02:19:54 +020013941/* Format a float into the writer if the writer is not NULL, or into *p_output
13942 otherwise.
13943
13944 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020013945static int
Victor Stinnera47082312012-10-04 02:19:54 +020013946formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
13947 PyObject **p_output,
13948 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013949{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013950 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013951 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013952 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020013953 int prec;
13954 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000013955
Guido van Rossumd57fd912000-03-10 22:53:23 +000013956 x = PyFloat_AsDouble(v);
13957 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020013958 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013959
Victor Stinnera47082312012-10-04 02:19:54 +020013960 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013961 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013962 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013963
Victor Stinnera47082312012-10-04 02:19:54 +020013964 if (arg->flags & F_ALT)
13965 dtoa_flags = Py_DTSF_ALT;
13966 else
13967 dtoa_flags = 0;
13968 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013969 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020013970 return -1;
13971 len = strlen(p);
13972 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010013973 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020013974 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013975 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020013976 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020013977 }
13978 else
13979 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000013980 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013981 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013982}
13983
Victor Stinnerd0880d52012-04-27 23:40:13 +020013984/* formatlong() emulates the format codes d, u, o, x and X, and
13985 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
13986 * Python's regular ints.
13987 * Return value: a new PyUnicodeObject*, or NULL if error.
13988 * The output string is of the form
13989 * "-"? ("0x" | "0X")? digit+
13990 * "0x"/"0X" are present only for x and X conversions, with F_ALT
13991 * set in flags. The case of hex digits will be correct,
13992 * There will be at least prec digits, zero-filled on the left if
13993 * necessary to get that many.
13994 * val object to be converted
13995 * flags bitmask of format flags; only F_ALT is looked at
13996 * prec minimum number of digits; 0-fill on left if needed
13997 * type a character in [duoxX]; u acts the same as d
13998 *
13999 * CAUTION: o, x and X conversions on regular ints can never
14000 * produce a '-' sign, but can for Python's unbounded ints.
14001 */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014002PyObject *
14003_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
Tim Peters38fd5b62000-09-21 05:43:11 +000014004{
Victor Stinnerd0880d52012-04-27 23:40:13 +020014005 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014006 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014007 Py_ssize_t i;
14008 int sign; /* 1 if '-', else 0 */
14009 int len; /* number of characters */
14010 Py_ssize_t llen;
14011 int numdigits; /* len == numnondigits + numdigits */
14012 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000014013
Victor Stinnerd0880d52012-04-27 23:40:13 +020014014 /* Avoid exceeding SSIZE_T_MAX */
14015 if (prec > INT_MAX-3) {
14016 PyErr_SetString(PyExc_OverflowError,
14017 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000014018 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014019 }
14020
14021 assert(PyLong_Check(val));
14022
14023 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020014024 default:
14025 assert(!"'type' not in [diuoxX]");
Victor Stinnerd0880d52012-04-27 23:40:13 +020014026 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020014027 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020014028 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070014029 /* int and int subclasses should print numerically when a numeric */
14030 /* format code is used (see issue18780) */
14031 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014032 break;
14033 case 'o':
14034 numnondigits = 2;
14035 result = PyNumber_ToBase(val, 8);
14036 break;
14037 case 'x':
14038 case 'X':
14039 numnondigits = 2;
14040 result = PyNumber_ToBase(val, 16);
14041 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014042 }
14043 if (!result)
14044 return NULL;
14045
14046 assert(unicode_modifiable(result));
14047 assert(PyUnicode_IS_READY(result));
14048 assert(PyUnicode_IS_ASCII(result));
14049
14050 /* To modify the string in-place, there can only be one reference. */
14051 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014052 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014053 PyErr_BadInternalCall();
14054 return NULL;
14055 }
14056 buf = PyUnicode_DATA(result);
14057 llen = PyUnicode_GET_LENGTH(result);
14058 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014059 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014060 PyErr_SetString(PyExc_ValueError,
Ethan Furmanb95b5612015-01-23 20:05:18 -080014061 "string too large in _PyUnicode_FormatLong");
Victor Stinnerd0880d52012-04-27 23:40:13 +020014062 return NULL;
14063 }
14064 len = (int)llen;
14065 sign = buf[0] == '-';
14066 numnondigits += sign;
14067 numdigits = len - numnondigits;
14068 assert(numdigits > 0);
14069
14070 /* Get rid of base marker unless F_ALT */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014071 if (((alt) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020014072 (type == 'o' || type == 'x' || type == 'X'))) {
14073 assert(buf[sign] == '0');
14074 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14075 buf[sign+1] == 'o');
14076 numnondigits -= 2;
14077 buf += 2;
14078 len -= 2;
14079 if (sign)
14080 buf[0] = '-';
14081 assert(len == numnondigits + numdigits);
14082 assert(numdigits > 0);
14083 }
14084
14085 /* Fill with leading zeroes to meet minimum width. */
14086 if (prec > numdigits) {
14087 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14088 numnondigits + prec);
14089 char *b1;
14090 if (!r1) {
14091 Py_DECREF(result);
14092 return NULL;
14093 }
14094 b1 = PyBytes_AS_STRING(r1);
14095 for (i = 0; i < numnondigits; ++i)
14096 *b1++ = *buf++;
14097 for (i = 0; i < prec - numdigits; i++)
14098 *b1++ = '0';
14099 for (i = 0; i < numdigits; i++)
14100 *b1++ = *buf++;
14101 *b1 = '\0';
14102 Py_DECREF(result);
14103 result = r1;
14104 buf = PyBytes_AS_STRING(result);
14105 len = numnondigits + prec;
14106 }
14107
14108 /* Fix up case for hex conversions. */
14109 if (type == 'X') {
14110 /* Need to convert all lower case letters to upper case.
14111 and need to convert 0x to 0X (and -0x to -0X). */
14112 for (i = 0; i < len; i++)
14113 if (buf[i] >= 'a' && buf[i] <= 'x')
14114 buf[i] -= 'a'-'A';
14115 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014116 if (!PyUnicode_Check(result)
14117 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020014118 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014119 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014120 Py_DECREF(result);
14121 result = unicode;
14122 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014123 else if (len != PyUnicode_GET_LENGTH(result)) {
14124 if (PyUnicode_Resize(&result, len) < 0)
14125 Py_CLEAR(result);
14126 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014127 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000014128}
14129
Ethan Furmandf3ed242014-01-05 06:50:30 -080014130/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014131 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014132 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014133 * -1 and raise an exception on error */
14134static int
Victor Stinnera47082312012-10-04 02:19:54 +020014135mainformatlong(PyObject *v,
14136 struct unicode_format_arg_t *arg,
14137 PyObject **p_output,
14138 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014139{
14140 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014141 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014142
14143 if (!PyNumber_Check(v))
14144 goto wrongtype;
14145
Ethan Furman9ab74802014-03-21 06:38:46 -070014146 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014147 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014148 if (type == 'o' || type == 'x' || type == 'X') {
14149 iobj = PyNumber_Index(v);
14150 if (iobj == NULL) {
Ethan Furman9ab74802014-03-21 06:38:46 -070014151 if (PyErr_ExceptionMatches(PyExc_TypeError))
14152 goto wrongtype;
Ethan Furman38d872e2014-03-19 08:38:52 -070014153 return -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014154 }
14155 }
14156 else {
14157 iobj = PyNumber_Long(v);
14158 if (iobj == NULL ) {
14159 if (PyErr_ExceptionMatches(PyExc_TypeError))
14160 goto wrongtype;
14161 return -1;
14162 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014163 }
14164 assert(PyLong_Check(iobj));
14165 }
14166 else {
14167 iobj = v;
14168 Py_INCREF(iobj);
14169 }
14170
14171 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014172 && arg->width == -1 && arg->prec == -1
14173 && !(arg->flags & (F_SIGN | F_BLANK))
14174 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014175 {
14176 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014177 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014178 int base;
14179
Victor Stinnera47082312012-10-04 02:19:54 +020014180 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014181 {
14182 default:
14183 assert(0 && "'type' not in [diuoxX]");
14184 case 'd':
14185 case 'i':
14186 case 'u':
14187 base = 10;
14188 break;
14189 case 'o':
14190 base = 8;
14191 break;
14192 case 'x':
14193 case 'X':
14194 base = 16;
14195 break;
14196 }
14197
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014198 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14199 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014200 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014201 }
14202 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014203 return 1;
14204 }
14205
Ethan Furmanb95b5612015-01-23 20:05:18 -080014206 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014207 Py_DECREF(iobj);
14208 if (res == NULL)
14209 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014210 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014211 return 0;
14212
14213wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014214 switch(type)
14215 {
14216 case 'o':
14217 case 'x':
14218 case 'X':
14219 PyErr_Format(PyExc_TypeError,
14220 "%%%c format: an integer is required, "
14221 "not %.200s",
14222 type, Py_TYPE(v)->tp_name);
14223 break;
14224 default:
14225 PyErr_Format(PyExc_TypeError,
14226 "%%%c format: a number is required, "
14227 "not %.200s",
14228 type, Py_TYPE(v)->tp_name);
14229 break;
14230 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014231 return -1;
14232}
14233
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014234static Py_UCS4
14235formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014236{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014237 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014238 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014239 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014240 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014241 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014242 goto onError;
14243 }
14244 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014245 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014246 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014247 /* make sure number is a type of integer */
14248 if (!PyLong_Check(v)) {
14249 iobj = PyNumber_Index(v);
14250 if (iobj == NULL) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014251 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014252 }
14253 v = iobj;
14254 Py_DECREF(iobj);
14255 }
14256 /* Integer input truncated to a character */
Benjamin Peterson29060642009-01-31 22:14:21 +000014257 x = PyLong_AsLong(v);
14258 if (x == -1 && PyErr_Occurred())
14259 goto onError;
14260
Victor Stinner8faf8212011-12-08 22:14:11 +010014261 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014262 PyErr_SetString(PyExc_OverflowError,
14263 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014264 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014265 }
14266
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014267 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014268 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014269
Benjamin Peterson29060642009-01-31 22:14:21 +000014270 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014271 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014272 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014273 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014274}
14275
Victor Stinnera47082312012-10-04 02:19:54 +020014276/* Parse options of an argument: flags, width, precision.
14277 Handle also "%(name)" syntax.
14278
14279 Return 0 if the argument has been formatted into arg->str.
14280 Return 1 if the argument has been written into ctx->writer,
14281 Raise an exception and return -1 on error. */
14282static int
14283unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14284 struct unicode_format_arg_t *arg)
14285{
14286#define FORMAT_READ(ctx) \
14287 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14288
14289 PyObject *v;
14290
Victor Stinnera47082312012-10-04 02:19:54 +020014291 if (arg->ch == '(') {
14292 /* Get argument value from a dictionary. Example: "%(name)s". */
14293 Py_ssize_t keystart;
14294 Py_ssize_t keylen;
14295 PyObject *key;
14296 int pcount = 1;
14297
14298 if (ctx->dict == NULL) {
14299 PyErr_SetString(PyExc_TypeError,
14300 "format requires a mapping");
14301 return -1;
14302 }
14303 ++ctx->fmtpos;
14304 --ctx->fmtcnt;
14305 keystart = ctx->fmtpos;
14306 /* Skip over balanced parentheses */
14307 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14308 arg->ch = FORMAT_READ(ctx);
14309 if (arg->ch == ')')
14310 --pcount;
14311 else if (arg->ch == '(')
14312 ++pcount;
14313 ctx->fmtpos++;
14314 }
14315 keylen = ctx->fmtpos - keystart - 1;
14316 if (ctx->fmtcnt < 0 || pcount > 0) {
14317 PyErr_SetString(PyExc_ValueError,
14318 "incomplete format key");
14319 return -1;
14320 }
14321 key = PyUnicode_Substring(ctx->fmtstr,
14322 keystart, keystart + keylen);
14323 if (key == NULL)
14324 return -1;
14325 if (ctx->args_owned) {
Victor Stinnera47082312012-10-04 02:19:54 +020014326 ctx->args_owned = 0;
Serhiy Storchaka191321d2015-12-27 15:41:34 +020014327 Py_DECREF(ctx->args);
Victor Stinnera47082312012-10-04 02:19:54 +020014328 }
14329 ctx->args = PyObject_GetItem(ctx->dict, key);
14330 Py_DECREF(key);
14331 if (ctx->args == NULL)
14332 return -1;
14333 ctx->args_owned = 1;
14334 ctx->arglen = -1;
14335 ctx->argidx = -2;
14336 }
14337
14338 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014339 while (--ctx->fmtcnt >= 0) {
14340 arg->ch = FORMAT_READ(ctx);
14341 ctx->fmtpos++;
14342 switch (arg->ch) {
14343 case '-': arg->flags |= F_LJUST; continue;
14344 case '+': arg->flags |= F_SIGN; continue;
14345 case ' ': arg->flags |= F_BLANK; continue;
14346 case '#': arg->flags |= F_ALT; continue;
14347 case '0': arg->flags |= F_ZERO; continue;
14348 }
14349 break;
14350 }
14351
14352 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014353 if (arg->ch == '*') {
14354 v = unicode_format_getnextarg(ctx);
14355 if (v == NULL)
14356 return -1;
14357 if (!PyLong_Check(v)) {
14358 PyErr_SetString(PyExc_TypeError,
14359 "* wants int");
14360 return -1;
14361 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014362 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014363 if (arg->width == -1 && PyErr_Occurred())
14364 return -1;
14365 if (arg->width < 0) {
14366 arg->flags |= F_LJUST;
14367 arg->width = -arg->width;
14368 }
14369 if (--ctx->fmtcnt >= 0) {
14370 arg->ch = FORMAT_READ(ctx);
14371 ctx->fmtpos++;
14372 }
14373 }
14374 else if (arg->ch >= '0' && arg->ch <= '9') {
14375 arg->width = arg->ch - '0';
14376 while (--ctx->fmtcnt >= 0) {
14377 arg->ch = FORMAT_READ(ctx);
14378 ctx->fmtpos++;
14379 if (arg->ch < '0' || arg->ch > '9')
14380 break;
14381 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14382 mixing signed and unsigned comparison. Since arg->ch is between
14383 '0' and '9', casting to int is safe. */
14384 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14385 PyErr_SetString(PyExc_ValueError,
14386 "width too big");
14387 return -1;
14388 }
14389 arg->width = arg->width*10 + (arg->ch - '0');
14390 }
14391 }
14392
14393 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014394 if (arg->ch == '.') {
14395 arg->prec = 0;
14396 if (--ctx->fmtcnt >= 0) {
14397 arg->ch = FORMAT_READ(ctx);
14398 ctx->fmtpos++;
14399 }
14400 if (arg->ch == '*') {
14401 v = unicode_format_getnextarg(ctx);
14402 if (v == NULL)
14403 return -1;
14404 if (!PyLong_Check(v)) {
14405 PyErr_SetString(PyExc_TypeError,
14406 "* wants int");
14407 return -1;
14408 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014409 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014410 if (arg->prec == -1 && PyErr_Occurred())
14411 return -1;
14412 if (arg->prec < 0)
14413 arg->prec = 0;
14414 if (--ctx->fmtcnt >= 0) {
14415 arg->ch = FORMAT_READ(ctx);
14416 ctx->fmtpos++;
14417 }
14418 }
14419 else if (arg->ch >= '0' && arg->ch <= '9') {
14420 arg->prec = arg->ch - '0';
14421 while (--ctx->fmtcnt >= 0) {
14422 arg->ch = FORMAT_READ(ctx);
14423 ctx->fmtpos++;
14424 if (arg->ch < '0' || arg->ch > '9')
14425 break;
14426 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14427 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014428 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014429 return -1;
14430 }
14431 arg->prec = arg->prec*10 + (arg->ch - '0');
14432 }
14433 }
14434 }
14435
14436 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14437 if (ctx->fmtcnt >= 0) {
14438 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14439 if (--ctx->fmtcnt >= 0) {
14440 arg->ch = FORMAT_READ(ctx);
14441 ctx->fmtpos++;
14442 }
14443 }
14444 }
14445 if (ctx->fmtcnt < 0) {
14446 PyErr_SetString(PyExc_ValueError,
14447 "incomplete format");
14448 return -1;
14449 }
14450 return 0;
14451
14452#undef FORMAT_READ
14453}
14454
14455/* Format one argument. Supported conversion specifiers:
14456
14457 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014458 - "i", "d", "u": int or float
14459 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014460 - "e", "E", "f", "F", "g", "G": float
14461 - "c": int or str (1 character)
14462
Victor Stinner8dbd4212012-12-04 09:30:24 +010014463 When possible, the output is written directly into the Unicode writer
14464 (ctx->writer). A string is created when padding is required.
14465
Victor Stinnera47082312012-10-04 02:19:54 +020014466 Return 0 if the argument has been formatted into *p_str,
14467 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014468 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014469static int
14470unicode_format_arg_format(struct unicode_formatter_t *ctx,
14471 struct unicode_format_arg_t *arg,
14472 PyObject **p_str)
14473{
14474 PyObject *v;
14475 _PyUnicodeWriter *writer = &ctx->writer;
14476
14477 if (ctx->fmtcnt == 0)
14478 ctx->writer.overallocate = 0;
14479
14480 if (arg->ch == '%') {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014481 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014482 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014483 return 1;
14484 }
14485
14486 v = unicode_format_getnextarg(ctx);
14487 if (v == NULL)
14488 return -1;
14489
Victor Stinnera47082312012-10-04 02:19:54 +020014490
14491 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014492 case 's':
14493 case 'r':
14494 case 'a':
14495 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14496 /* Fast path */
14497 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14498 return -1;
14499 return 1;
14500 }
14501
14502 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14503 *p_str = v;
14504 Py_INCREF(*p_str);
14505 }
14506 else {
14507 if (arg->ch == 's')
14508 *p_str = PyObject_Str(v);
14509 else if (arg->ch == 'r')
14510 *p_str = PyObject_Repr(v);
14511 else
14512 *p_str = PyObject_ASCII(v);
14513 }
14514 break;
14515
14516 case 'i':
14517 case 'd':
14518 case 'u':
14519 case 'o':
14520 case 'x':
14521 case 'X':
14522 {
14523 int ret = mainformatlong(v, arg, p_str, writer);
14524 if (ret != 0)
14525 return ret;
14526 arg->sign = 1;
14527 break;
14528 }
14529
14530 case 'e':
14531 case 'E':
14532 case 'f':
14533 case 'F':
14534 case 'g':
14535 case 'G':
14536 if (arg->width == -1 && arg->prec == -1
14537 && !(arg->flags & (F_SIGN | F_BLANK)))
14538 {
14539 /* Fast path */
14540 if (formatfloat(v, arg, NULL, writer) == -1)
14541 return -1;
14542 return 1;
14543 }
14544
14545 arg->sign = 1;
14546 if (formatfloat(v, arg, p_str, NULL) == -1)
14547 return -1;
14548 break;
14549
14550 case 'c':
14551 {
14552 Py_UCS4 ch = formatchar(v);
14553 if (ch == (Py_UCS4) -1)
14554 return -1;
14555 if (arg->width == -1 && arg->prec == -1) {
14556 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014557 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014558 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014559 return 1;
14560 }
14561 *p_str = PyUnicode_FromOrdinal(ch);
14562 break;
14563 }
14564
14565 default:
14566 PyErr_Format(PyExc_ValueError,
14567 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020014568 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020014569 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14570 (int)arg->ch,
14571 ctx->fmtpos - 1);
14572 return -1;
14573 }
14574 if (*p_str == NULL)
14575 return -1;
14576 assert (PyUnicode_Check(*p_str));
14577 return 0;
14578}
14579
14580static int
14581unicode_format_arg_output(struct unicode_formatter_t *ctx,
14582 struct unicode_format_arg_t *arg,
14583 PyObject *str)
14584{
14585 Py_ssize_t len;
14586 enum PyUnicode_Kind kind;
14587 void *pbuf;
14588 Py_ssize_t pindex;
14589 Py_UCS4 signchar;
14590 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014591 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014592 Py_ssize_t sublen;
14593 _PyUnicodeWriter *writer = &ctx->writer;
14594 Py_UCS4 fill;
14595
14596 fill = ' ';
14597 if (arg->sign && arg->flags & F_ZERO)
14598 fill = '0';
14599
14600 if (PyUnicode_READY(str) == -1)
14601 return -1;
14602
14603 len = PyUnicode_GET_LENGTH(str);
14604 if ((arg->width == -1 || arg->width <= len)
14605 && (arg->prec == -1 || arg->prec >= len)
14606 && !(arg->flags & (F_SIGN | F_BLANK)))
14607 {
14608 /* Fast path */
14609 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
14610 return -1;
14611 return 0;
14612 }
14613
14614 /* Truncate the string for "s", "r" and "a" formats
14615 if the precision is set */
14616 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
14617 if (arg->prec >= 0 && len > arg->prec)
14618 len = arg->prec;
14619 }
14620
14621 /* Adjust sign and width */
14622 kind = PyUnicode_KIND(str);
14623 pbuf = PyUnicode_DATA(str);
14624 pindex = 0;
14625 signchar = '\0';
14626 if (arg->sign) {
14627 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14628 if (ch == '-' || ch == '+') {
14629 signchar = ch;
14630 len--;
14631 pindex++;
14632 }
14633 else if (arg->flags & F_SIGN)
14634 signchar = '+';
14635 else if (arg->flags & F_BLANK)
14636 signchar = ' ';
14637 else
14638 arg->sign = 0;
14639 }
14640 if (arg->width < len)
14641 arg->width = len;
14642
14643 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014644 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014645 if (!(arg->flags & F_LJUST)) {
14646 if (arg->sign) {
14647 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014648 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014649 }
14650 else {
14651 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014652 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014653 }
14654 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014655 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
14656 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014657 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014658 }
14659
Victor Stinnera47082312012-10-04 02:19:54 +020014660 buflen = arg->width;
14661 if (arg->sign && len == arg->width)
14662 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014663 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020014664 return -1;
14665
14666 /* Write the sign if needed */
14667 if (arg->sign) {
14668 if (fill != ' ') {
14669 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14670 writer->pos += 1;
14671 }
14672 if (arg->width > len)
14673 arg->width--;
14674 }
14675
14676 /* Write the numeric prefix for "x", "X" and "o" formats
14677 if the alternate form is used.
14678 For example, write "0x" for the "%#x" format. */
14679 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14680 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14681 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14682 if (fill != ' ') {
14683 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14684 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14685 writer->pos += 2;
14686 pindex += 2;
14687 }
14688 arg->width -= 2;
14689 if (arg->width < 0)
14690 arg->width = 0;
14691 len -= 2;
14692 }
14693
14694 /* Pad left with the fill character if needed */
14695 if (arg->width > len && !(arg->flags & F_LJUST)) {
14696 sublen = arg->width - len;
14697 FILL(writer->kind, writer->data, fill, writer->pos, sublen);
14698 writer->pos += sublen;
14699 arg->width = len;
14700 }
14701
14702 /* If padding with spaces: write sign if needed and/or numeric prefix if
14703 the alternate form is used */
14704 if (fill == ' ') {
14705 if (arg->sign) {
14706 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14707 writer->pos += 1;
14708 }
14709 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14710 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14711 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
14712 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14713 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14714 writer->pos += 2;
14715 pindex += 2;
14716 }
14717 }
14718
14719 /* Write characters */
14720 if (len) {
14721 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14722 str, pindex, len);
14723 writer->pos += len;
14724 }
14725
14726 /* Pad right with the fill character if needed */
14727 if (arg->width > len) {
14728 sublen = arg->width - len;
14729 FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
14730 writer->pos += sublen;
14731 }
14732 return 0;
14733}
14734
14735/* Helper of PyUnicode_Format(): format one arg.
14736 Return 0 on success, raise an exception and return -1 on error. */
14737static int
14738unicode_format_arg(struct unicode_formatter_t *ctx)
14739{
14740 struct unicode_format_arg_t arg;
14741 PyObject *str;
14742 int ret;
14743
Victor Stinner8dbd4212012-12-04 09:30:24 +010014744 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
14745 arg.flags = 0;
14746 arg.width = -1;
14747 arg.prec = -1;
14748 arg.sign = 0;
14749 str = NULL;
14750
Victor Stinnera47082312012-10-04 02:19:54 +020014751 ret = unicode_format_arg_parse(ctx, &arg);
14752 if (ret == -1)
14753 return -1;
14754
14755 ret = unicode_format_arg_format(ctx, &arg, &str);
14756 if (ret == -1)
14757 return -1;
14758
14759 if (ret != 1) {
14760 ret = unicode_format_arg_output(ctx, &arg, str);
14761 Py_DECREF(str);
14762 if (ret == -1)
14763 return -1;
14764 }
14765
14766 if (ctx->dict && (ctx->argidx < ctx->arglen) && arg.ch != '%') {
14767 PyErr_SetString(PyExc_TypeError,
14768 "not all arguments converted during string formatting");
14769 return -1;
14770 }
14771 return 0;
14772}
14773
Alexander Belopolsky40018472011-02-26 01:02:56 +000014774PyObject *
14775PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014776{
Victor Stinnera47082312012-10-04 02:19:54 +020014777 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000014778
Guido van Rossumd57fd912000-03-10 22:53:23 +000014779 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014780 PyErr_BadInternalCall();
14781 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014782 }
Victor Stinnera47082312012-10-04 02:19:54 +020014783
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014784 if (ensure_unicode(format) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014785 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014786
14787 ctx.fmtstr = format;
Victor Stinnera47082312012-10-04 02:19:54 +020014788 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14789 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14790 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14791 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014792
Victor Stinner8f674cc2013-04-17 23:02:17 +020014793 _PyUnicodeWriter_Init(&ctx.writer);
14794 ctx.writer.min_length = ctx.fmtcnt + 100;
14795 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014796
Guido van Rossumd57fd912000-03-10 22:53:23 +000014797 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014798 ctx.arglen = PyTuple_Size(args);
14799 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014800 }
14801 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014802 ctx.arglen = -1;
14803 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014804 }
Victor Stinnera47082312012-10-04 02:19:54 +020014805 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040014806 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020014807 ctx.dict = args;
14808 else
14809 ctx.dict = NULL;
14810 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014811
Victor Stinnera47082312012-10-04 02:19:54 +020014812 while (--ctx.fmtcnt >= 0) {
14813 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020014814 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020014815
14816 nonfmtpos = ctx.fmtpos++;
14817 while (ctx.fmtcnt >= 0 &&
14818 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14819 ctx.fmtpos++;
14820 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014821 }
Victor Stinnera47082312012-10-04 02:19:54 +020014822 if (ctx.fmtcnt < 0) {
14823 ctx.fmtpos--;
14824 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020014825 }
Victor Stinneree4544c2012-05-09 22:24:08 +020014826
Victor Stinnercfc4c132013-04-03 01:48:39 +020014827 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14828 nonfmtpos, ctx.fmtpos) < 0)
14829 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014830 }
14831 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014832 ctx.fmtpos++;
14833 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000014834 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020014835 }
14836 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020014837
Victor Stinnera47082312012-10-04 02:19:54 +020014838 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014839 PyErr_SetString(PyExc_TypeError,
14840 "not all arguments converted during string formatting");
14841 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014842 }
14843
Victor Stinnera47082312012-10-04 02:19:54 +020014844 if (ctx.args_owned) {
14845 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014846 }
Victor Stinnera47082312012-10-04 02:19:54 +020014847 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014848
Benjamin Peterson29060642009-01-31 22:14:21 +000014849 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020014850 _PyUnicodeWriter_Dealloc(&ctx.writer);
14851 if (ctx.args_owned) {
14852 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014853 }
14854 return NULL;
14855}
14856
Jeremy Hylton938ace62002-07-17 16:30:39 +000014857static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000014858unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14859
Tim Peters6d6c1a32001-08-02 04:15:00 +000014860static PyObject *
14861unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14862{
Benjamin Peterson29060642009-01-31 22:14:21 +000014863 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014864 static char *kwlist[] = {"object", "encoding", "errors", 0};
14865 char *encoding = NULL;
14866 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000014867
Benjamin Peterson14339b62009-01-31 16:36:08 +000014868 if (type != &PyUnicode_Type)
14869 return unicode_subtype_new(type, args, kwds);
14870 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000014871 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000014872 return NULL;
14873 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020014874 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000014875 if (encoding == NULL && errors == NULL)
14876 return PyObject_Str(x);
14877 else
Benjamin Peterson29060642009-01-31 22:14:21 +000014878 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000014879}
14880
Guido van Rossume023fe02001-08-30 03:12:59 +000014881static PyObject *
14882unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14883{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014884 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014885 Py_ssize_t length, char_size;
14886 int share_wstr, share_utf8;
14887 unsigned int kind;
14888 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000014889
Benjamin Peterson14339b62009-01-31 16:36:08 +000014890 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014891
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014892 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014893 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014894 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014895 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050014896 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060014897 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014898 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060014899 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014900
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014901 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014902 if (self == NULL) {
14903 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014904 return NULL;
14905 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014906 kind = PyUnicode_KIND(unicode);
14907 length = PyUnicode_GET_LENGTH(unicode);
14908
14909 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014910#ifdef Py_DEBUG
14911 _PyUnicode_HASH(self) = -1;
14912#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014913 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014914#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014915 _PyUnicode_STATE(self).interned = 0;
14916 _PyUnicode_STATE(self).kind = kind;
14917 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020014918 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014919 _PyUnicode_STATE(self).ready = 1;
14920 _PyUnicode_WSTR(self) = NULL;
14921 _PyUnicode_UTF8_LENGTH(self) = 0;
14922 _PyUnicode_UTF8(self) = NULL;
14923 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020014924 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014925
14926 share_utf8 = 0;
14927 share_wstr = 0;
14928 if (kind == PyUnicode_1BYTE_KIND) {
14929 char_size = 1;
14930 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
14931 share_utf8 = 1;
14932 }
14933 else if (kind == PyUnicode_2BYTE_KIND) {
14934 char_size = 2;
14935 if (sizeof(wchar_t) == 2)
14936 share_wstr = 1;
14937 }
14938 else {
14939 assert(kind == PyUnicode_4BYTE_KIND);
14940 char_size = 4;
14941 if (sizeof(wchar_t) == 4)
14942 share_wstr = 1;
14943 }
14944
14945 /* Ensure we won't overflow the length. */
14946 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
14947 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014948 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014949 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014950 data = PyObject_MALLOC((length + 1) * char_size);
14951 if (data == NULL) {
14952 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014953 goto onError;
14954 }
14955
Victor Stinnerc3c74152011-10-02 20:39:55 +020014956 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014957 if (share_utf8) {
14958 _PyUnicode_UTF8_LENGTH(self) = length;
14959 _PyUnicode_UTF8(self) = data;
14960 }
14961 if (share_wstr) {
14962 _PyUnicode_WSTR_LENGTH(self) = length;
14963 _PyUnicode_WSTR(self) = (wchar_t *)data;
14964 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014965
Christian Heimesf051e432016-09-13 20:22:02 +020014966 memcpy(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020014967 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014968 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014969#ifdef Py_DEBUG
14970 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14971#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020014972 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010014973 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014974
14975onError:
14976 Py_DECREF(unicode);
14977 Py_DECREF(self);
14978 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000014979}
14980
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000014981PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070014982"str(object='') -> str\n\
14983str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000014984\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100014985Create a new string object from the given object. If encoding or\n\
14986errors is specified, then the object must expose a data buffer\n\
14987that will be decoded using the given encoding and error handler.\n\
14988Otherwise, returns the result of object.__str__() (if defined)\n\
14989or repr(object).\n\
14990encoding defaults to sys.getdefaultencoding().\n\
14991errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000014992
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014993static PyObject *unicode_iter(PyObject *seq);
14994
Guido van Rossumd57fd912000-03-10 22:53:23 +000014995PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000014996 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014997 "str", /* tp_name */
14998 sizeof(PyUnicodeObject), /* tp_size */
14999 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015000 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015001 (destructor)unicode_dealloc, /* tp_dealloc */
15002 0, /* tp_print */
15003 0, /* tp_getattr */
15004 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000015005 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015006 unicode_repr, /* tp_repr */
15007 &unicode_as_number, /* tp_as_number */
15008 &unicode_as_sequence, /* tp_as_sequence */
15009 &unicode_as_mapping, /* tp_as_mapping */
15010 (hashfunc) unicode_hash, /* tp_hash*/
15011 0, /* tp_call*/
15012 (reprfunc) unicode_str, /* tp_str */
15013 PyObject_GenericGetAttr, /* tp_getattro */
15014 0, /* tp_setattro */
15015 0, /* tp_as_buffer */
15016 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000015017 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015018 unicode_doc, /* tp_doc */
15019 0, /* tp_traverse */
15020 0, /* tp_clear */
15021 PyUnicode_RichCompare, /* tp_richcompare */
15022 0, /* tp_weaklistoffset */
15023 unicode_iter, /* tp_iter */
15024 0, /* tp_iternext */
15025 unicode_methods, /* tp_methods */
15026 0, /* tp_members */
15027 0, /* tp_getset */
15028 &PyBaseObject_Type, /* tp_base */
15029 0, /* tp_dict */
15030 0, /* tp_descr_get */
15031 0, /* tp_descr_set */
15032 0, /* tp_dictoffset */
15033 0, /* tp_init */
15034 0, /* tp_alloc */
15035 unicode_new, /* tp_new */
15036 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015037};
15038
15039/* Initialize the Unicode implementation */
15040
Victor Stinner3a50e702011-10-18 21:21:00 +020015041int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015042{
Thomas Wouters477c8d52006-05-27 19:21:47 +000015043 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015044 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000015045 0x000A, /* LINE FEED */
15046 0x000D, /* CARRIAGE RETURN */
15047 0x001C, /* FILE SEPARATOR */
15048 0x001D, /* GROUP SEPARATOR */
15049 0x001E, /* RECORD SEPARATOR */
15050 0x0085, /* NEXT LINE */
15051 0x2028, /* LINE SEPARATOR */
15052 0x2029, /* PARAGRAPH SEPARATOR */
15053 };
15054
Fred Drakee4315f52000-05-09 19:53:39 +000015055 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020015056 _Py_INCREF_UNICODE_EMPTY();
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015057 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015058 Py_FatalError("Can't create empty string");
Serhiy Storchaka678db842013-01-26 12:16:36 +020015059 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015060
Guido van Rossumcacfc072002-05-24 19:01:59 +000015061 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000015062 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000015063
15064 /* initialize the linebreak bloom filter */
15065 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015066 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020015067 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015068
Christian Heimes26532f72013-07-20 14:57:16 +020015069 if (PyType_Ready(&EncodingMapType) < 0)
15070 Py_FatalError("Can't initialize encoding map type");
Victor Stinner3a50e702011-10-18 21:21:00 +020015071
Benjamin Petersonc4311282012-10-30 23:21:10 -040015072 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
15073 Py_FatalError("Can't initialize field name iterator type");
15074
15075 if (PyType_Ready(&PyFormatterIter_Type) < 0)
15076 Py_FatalError("Can't initialize formatter iter type");
Benjamin Petersone8ea97f2012-10-30 23:27:52 -040015077
Victor Stinner3a50e702011-10-18 21:21:00 +020015078 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015079}
15080
15081/* Finalize the Unicode implementation */
15082
/* The body does no work and unconditionally reports 0.
   Returns: always 0. */
int PyUnicode_ClearFreeList(void)
{
    const int released = 0;

    return released;
}
15088
Guido van Rossumd57fd912000-03-10 22:53:23 +000015089void
Thomas Wouters78890102000-07-22 19:25:51 +000015090_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015091{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000015092 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015093
Serhiy Storchaka05997252013-01-26 12:14:02 +020015094 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +000015095
Serhiy Storchaka05997252013-01-26 12:14:02 +020015096 for (i = 0; i < 256; i++)
15097 Py_CLEAR(unicode_latin1[i]);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020015098 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000015099 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015100}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015101
Walter Dörwald16807132007-05-25 13:52:07 +000015102void
15103PyUnicode_InternInPlace(PyObject **p)
15104{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015105 PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015106 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015107#ifdef Py_DEBUG
15108 assert(s != NULL);
15109 assert(_PyUnicode_CHECK(s));
15110#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000015111 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020015112 return;
15113#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015114 /* If it's a subclass, we don't really know what putting
15115 it in the interned dict might do. */
15116 if (!PyUnicode_CheckExact(s))
15117 return;
15118 if (PyUnicode_CHECK_INTERNED(s))
15119 return;
15120 if (interned == NULL) {
15121 interned = PyDict_New();
15122 if (interned == NULL) {
15123 PyErr_Clear(); /* Don't leave an exception */
15124 return;
15125 }
15126 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015127 Py_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015128 t = PyDict_SetDefault(interned, s, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015129 Py_END_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015130 if (t == NULL) {
15131 PyErr_Clear();
15132 return;
15133 }
15134 if (t != s) {
Victor Stinnerf0335102013-04-14 19:13:03 +020015135 Py_INCREF(t);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030015136 Py_SETREF(*p, t);
Victor Stinnerf0335102013-04-14 19:13:03 +020015137 return;
15138 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015139 /* The two references in interned are not counted by refcnt.
15140 The deallocator will take care of this */
15141 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015142 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000015143}
15144
15145void
15146PyUnicode_InternImmortal(PyObject **p)
15147{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015148 PyUnicode_InternInPlace(p);
15149 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015150 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015151 Py_INCREF(*p);
15152 }
Walter Dörwald16807132007-05-25 13:52:07 +000015153}
15154
15155PyObject *
15156PyUnicode_InternFromString(const char *cp)
15157{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015158 PyObject *s = PyUnicode_FromString(cp);
15159 if (s == NULL)
15160 return NULL;
15161 PyUnicode_InternInPlace(&s);
15162 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015163}
15164
Alexander Belopolsky40018472011-02-26 01:02:56 +000015165void
15166_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000015167{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015168 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015169 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015170 Py_ssize_t i, n;
15171 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000015172
Benjamin Peterson14339b62009-01-31 16:36:08 +000015173 if (interned == NULL || !PyDict_Check(interned))
15174 return;
15175 keys = PyDict_Keys(interned);
15176 if (keys == NULL || !PyList_Check(keys)) {
15177 PyErr_Clear();
15178 return;
15179 }
Walter Dörwald16807132007-05-25 13:52:07 +000015180
Benjamin Peterson14339b62009-01-31 16:36:08 +000015181 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
15182 detector, interned unicode strings are not forcibly deallocated;
15183 rather, we give them their stolen references back, and then clear
15184 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000015185
Benjamin Peterson14339b62009-01-31 16:36:08 +000015186 n = PyList_GET_SIZE(keys);
15187 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000015188 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015189 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015190 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015191 if (PyUnicode_READY(s) == -1) {
15192 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015193 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015194 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015195 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015196 case SSTATE_NOT_INTERNED:
15197 /* XXX Shouldn't happen */
15198 break;
15199 case SSTATE_INTERNED_IMMORTAL:
15200 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015201 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015202 break;
15203 case SSTATE_INTERNED_MORTAL:
15204 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015205 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015206 break;
15207 default:
15208 Py_FatalError("Inconsistent interned string state.");
15209 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015210 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015211 }
15212 fprintf(stderr, "total size of all interned strings: "
15213 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
15214 "mortal/immortal\n", mortal_size, immortal_size);
15215 Py_DECREF(keys);
15216 PyDict_Clear(interned);
Serhiy Storchaka05997252013-01-26 12:14:02 +020015217 Py_CLEAR(interned);
Walter Dörwald16807132007-05-25 13:52:07 +000015218}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015219
15220
15221/********************* Unicode Iterator **************************/
15222
15223typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015224 PyObject_HEAD
15225 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015226 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015227} unicodeiterobject;
15228
15229static void
15230unicodeiter_dealloc(unicodeiterobject *it)
15231{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015232 _PyObject_GC_UNTRACK(it);
15233 Py_XDECREF(it->it_seq);
15234 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015235}
15236
15237static int
15238unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15239{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015240 Py_VISIT(it->it_seq);
15241 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015242}
15243
15244static PyObject *
15245unicodeiter_next(unicodeiterobject *it)
15246{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015247 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015248
Benjamin Peterson14339b62009-01-31 16:36:08 +000015249 assert(it != NULL);
15250 seq = it->it_seq;
15251 if (seq == NULL)
15252 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015253 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015254
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015255 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15256 int kind = PyUnicode_KIND(seq);
15257 void *data = PyUnicode_DATA(seq);
15258 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15259 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015260 if (item != NULL)
15261 ++it->it_index;
15262 return item;
15263 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015264
Benjamin Peterson14339b62009-01-31 16:36:08 +000015265 it->it_seq = NULL;
Serhiy Storchakafbb1c5e2016-03-30 20:40:02 +030015266 Py_DECREF(seq);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015267 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015268}
15269
15270static PyObject *
15271unicodeiter_len(unicodeiterobject *it)
15272{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015273 Py_ssize_t len = 0;
15274 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015275 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015276 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015277}
15278
15279PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15280
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015281static PyObject *
15282unicodeiter_reduce(unicodeiterobject *it)
15283{
15284 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020015285 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015286 it->it_seq, it->it_index);
15287 } else {
15288 PyObject *u = PyUnicode_FromUnicode(NULL, 0);
15289 if (u == NULL)
15290 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020015291 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015292 }
15293}
15294
15295PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15296
15297static PyObject *
15298unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15299{
15300 Py_ssize_t index = PyLong_AsSsize_t(state);
15301 if (index == -1 && PyErr_Occurred())
15302 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015303 if (it->it_seq != NULL) {
15304 if (index < 0)
15305 index = 0;
15306 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15307 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15308 it->it_index = index;
15309 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015310 Py_RETURN_NONE;
15311}
15312
15313PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15314
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015315static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015316 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015317 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015318 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15319 reduce_doc},
15320 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15321 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015322 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015323};
15324
15325PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015326 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15327 "str_iterator", /* tp_name */
15328 sizeof(unicodeiterobject), /* tp_basicsize */
15329 0, /* tp_itemsize */
15330 /* methods */
15331 (destructor)unicodeiter_dealloc, /* tp_dealloc */
15332 0, /* tp_print */
15333 0, /* tp_getattr */
15334 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000015335 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015336 0, /* tp_repr */
15337 0, /* tp_as_number */
15338 0, /* tp_as_sequence */
15339 0, /* tp_as_mapping */
15340 0, /* tp_hash */
15341 0, /* tp_call */
15342 0, /* tp_str */
15343 PyObject_GenericGetAttr, /* tp_getattro */
15344 0, /* tp_setattro */
15345 0, /* tp_as_buffer */
15346 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15347 0, /* tp_doc */
15348 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15349 0, /* tp_clear */
15350 0, /* tp_richcompare */
15351 0, /* tp_weaklistoffset */
15352 PyObject_SelfIter, /* tp_iter */
15353 (iternextfunc)unicodeiter_next, /* tp_iternext */
15354 unicodeiter_methods, /* tp_methods */
15355 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015356};
15357
15358static PyObject *
15359unicode_iter(PyObject *seq)
15360{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015361 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015362
Benjamin Peterson14339b62009-01-31 16:36:08 +000015363 if (!PyUnicode_Check(seq)) {
15364 PyErr_BadInternalCall();
15365 return NULL;
15366 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015367 if (PyUnicode_READY(seq) == -1)
15368 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015369 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15370 if (it == NULL)
15371 return NULL;
15372 it->it_index = 0;
15373 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015374 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015375 _PyObject_GC_TRACK(it);
15376 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015377}
15378
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015379
15380size_t
15381Py_UNICODE_strlen(const Py_UNICODE *u)
15382{
15383 int res = 0;
15384 while(*u++)
15385 res++;
15386 return res;
15387}
15388
15389Py_UNICODE*
15390Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15391{
15392 Py_UNICODE *u = s1;
15393 while ((*u++ = *s2++));
15394 return s1;
15395}
15396
15397Py_UNICODE*
15398Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15399{
15400 Py_UNICODE *u = s1;
15401 while ((*u++ = *s2++))
15402 if (n-- == 0)
15403 break;
15404 return s1;
15405}
15406
15407Py_UNICODE*
15408Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15409{
15410 Py_UNICODE *u1 = s1;
15411 u1 += Py_UNICODE_strlen(u1);
15412 Py_UNICODE_strcpy(u1, s2);
15413 return s1;
15414}
15415
15416int
15417Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15418{
15419 while (*s1 && *s2 && *s1 == *s2)
15420 s1++, s2++;
15421 if (*s1 && *s2)
15422 return (*s1 < *s2) ? -1 : +1;
15423 if (*s1)
15424 return 1;
15425 if (*s2)
15426 return -1;
15427 return 0;
15428}
15429
15430int
15431Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15432{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015433 Py_UNICODE u1, u2;
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015434 for (; n != 0; n--) {
15435 u1 = *s1;
15436 u2 = *s2;
15437 if (u1 != u2)
15438 return (u1 < u2) ? -1 : +1;
15439 if (u1 == '\0')
15440 return 0;
15441 s1++;
15442 s2++;
15443 }
15444 return 0;
15445}
15446
15447Py_UNICODE*
15448Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15449{
15450 const Py_UNICODE *p;
15451 for (p = s; *p; p++)
15452 if (*p == c)
15453 return (Py_UNICODE*)p;
15454 return NULL;
15455}
15456
15457Py_UNICODE*
15458Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15459{
15460 const Py_UNICODE *p;
15461 p = s + Py_UNICODE_strlen(s);
15462 while (p != s) {
15463 p--;
15464 if (*p == c)
15465 return (Py_UNICODE*)p;
15466 }
15467 return NULL;
15468}
Victor Stinner331ea922010-08-10 16:37:20 +000015469
Victor Stinner71133ff2010-09-01 23:43:53 +000015470Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015471PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015472{
Victor Stinner577db2c2011-10-11 22:12:48 +020015473 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015474 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015475
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015476 if (!PyUnicode_Check(unicode)) {
15477 PyErr_BadArgument();
15478 return NULL;
15479 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015480 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015481 if (u == NULL)
15482 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015483 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -070015484 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015485 PyErr_NoMemory();
15486 return NULL;
15487 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015488 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015489 size *= sizeof(Py_UNICODE);
15490 copy = PyMem_Malloc(size);
15491 if (copy == NULL) {
15492 PyErr_NoMemory();
15493 return NULL;
15494 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015495 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015496 return copy;
15497}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015498
Georg Brandl66c221e2010-10-14 07:04:07 +000015499/* A _string module, to export formatter_parser and formatter_field_name_split
15500 to the string.Formatter class implemented in Python. */
15501
15502static PyMethodDef _string_methods[] = {
15503 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15504 METH_O, PyDoc_STR("split the argument as a field name")},
15505 {"formatter_parser", (PyCFunction) formatter_parser,
15506 METH_O, PyDoc_STR("parse the argument as a format string")},
15507 {NULL, NULL}
15508};
15509
15510static struct PyModuleDef _string_module = {
15511 PyModuleDef_HEAD_INIT,
15512 "_string",
15513 PyDoc_STR("string helper module"),
15514 0,
15515 _string_methods,
15516 NULL,
15517 NULL,
15518 NULL,
15519 NULL
15520};
15521
15522PyMODINIT_FUNC
15523PyInit__string(void)
15524{
15525 return PyModule_Create(&_string_module);
15526}
15527
15528
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000015529#ifdef __cplusplus
15530}
15531#endif