blob: cd8b33c5945a26a0b57640c5b2b161c4854407a0 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Raymond Hettingerac2ef652015-07-04 16:04:44 -070045#include "stringlib/eq.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000046
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000047#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000048#include <windows.h>
49#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000050
Larry Hastings61272b72014-01-07 12:41:53 -080051/*[clinic input]
Larry Hastingsc2047262014-01-25 20:43:29 -080052class str "PyUnicodeObject *" "&PyUnicode_Type"
Larry Hastings61272b72014-01-07 12:41:53 -080053[clinic start generated code]*/
Larry Hastings581ee362014-01-28 05:00:08 -080054/*[clinic end generated code: output=da39a3ee5e6b4b0d input=604e916854800fa8]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080055
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000056/* --- Globals ------------------------------------------------------------
57
Serhiy Storchaka05997252013-01-26 12:14:02 +020058NOTE: In the interpreter's initialization phase, some globals are currently
59 initialized dynamically as needed. In the process Unicode objects may
60 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000061
62*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000063
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000064
65#ifdef __cplusplus
66extern "C" {
67#endif
68
/* Largest valid Unicode code point: U+10FFFF (decimal 1,114,111). */
#define MAX_UNICODE 0x10ffff

/* Object check used by the accessor macros of this file.  Debug builds
   route it through _PyUnicode_CheckConsistency() (with the content check
   disabled); all other builds only call PyUnicode_Check(). */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020077
/* --- Field accessors ---------------------------------------------------

   The macros with a leading underscore are raw field accesses through a
   cast and do no checking, unless noted otherwise.  The variants without
   the underscore assert that the object passes _PyUnicode_CHECK() and is
   ready before touching it. */

/* utf8 field of a PyCompactUnicodeObject (unchecked). */
#define _PyUnicode_UTF8(op)                                 \
    (((PyCompactUnicodeObject*)(op))->utf8)

/* UTF-8 pointer of a ready string: for compact ASCII objects this is the
   memory directly following the PyASCIIObject header, otherwise it is the
   utf8 field. */
#define PyUnicode_UTF8(op)                                  \
    (assert(_PyUnicode_CHECK(op)),                          \
     assert(PyUnicode_IS_READY(op)),                        \
     PyUnicode_IS_COMPACT_ASCII(op) ?                       \
         ((char*)((PyASCIIObject*)(op) + 1)) :              \
         _PyUnicode_UTF8(op))

/* utf8_length field of a PyCompactUnicodeObject (unchecked). */
#define _PyUnicode_UTF8_LENGTH(op)                          \
    (((PyCompactUnicodeObject*)(op))->utf8_length)

/* UTF-8 length of a ready string: the character length for compact ASCII
   objects, otherwise the utf8_length field. */
#define PyUnicode_UTF8_LENGTH(op)                           \
    (assert(_PyUnicode_CHECK(op)),                          \
     assert(PyUnicode_IS_READY(op)),                        \
     PyUnicode_IS_COMPACT_ASCII(op) ?                       \
         ((PyASCIIObject*)(op))->length :                   \
         _PyUnicode_UTF8_LENGTH(op))

/* wstr pointer and wstr length (unchecked). */
#define _PyUnicode_WSTR(op)                                 \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                          \
    (((PyCompactUnicodeObject*)(op))->wstr_length)

/* length, state and hash fields of the PyASCIIObject header (unchecked). */
#define _PyUnicode_LENGTH(op)                               \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                                \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                                 \
    (((PyASCIIObject *)(op))->hash)

/* state.kind and length, with an object check but without a ready check. */
#define _PyUnicode_KIND(op)                                 \
    (assert(_PyUnicode_CHECK(op)),                          \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                           \
    (assert(_PyUnicode_CHECK(op)),                          \
     ((PyASCIIObject *)(op))->length)

/* data.any pointer of a PyUnicodeObject (unchecked). */
#define _PyUnicode_DATA_ANY(op)                             \
    (((PyUnicodeObject*)(op))->data.any)

/* Local replacement for PyUnicode_READY(): evaluates to 0 when the object
   is already ready, otherwise to the result of _PyUnicode_Ready(). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                                 \
    (assert(_PyUnicode_CHECK(op)),                          \
     (PyUnicode_IS_READY(op) ? 0 : _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200119
/* True if the UTF-8 pointer of `op` is the very same pointer as its
   character data, i.e. the UTF-8 form is shared with the data instead of
   living in a buffer of its own.  Must not be used on compact ASCII
   objects (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer of `op` is the very same pointer as its
   character data.  The macro only refers to its own argument, so it can be
   used whatever the argument expression is called at the call site. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
127
/* Nonzero when the object has a UTF-8 buffer of its own: it is not compact
   ASCII, the utf8 pointer is set, and that pointer is not the character
   data pointer. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                      \
    (!PyUnicode_IS_COMPACT_ASCII(op)                        \
     && _PyUnicode_UTF8(op) != NULL                         \
     && _PyUnicode_UTF8(op) != PyUnicode_DATA(op))

/* Nonzero when the object has a wstr buffer of its own: the wstr pointer
   is set and either the object is not ready yet or the pointer is not the
   character data pointer. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                      \
    (_PyUnicode_WSTR(op) != NULL                            \
     && (!PyUnicode_IS_READY(op)                            \
         || _PyUnicode_WSTR(op) != PyUnicode_DATA(op)))
141
/* Element-wise copy between character buffers of different widths.

   from_type and to_type must be valid type names.  begin and end delimit
   the source range and should be of type "from_type *"; to points to the
   destination buffer and should be of type "to_type *".  Every source
   element is cast to to_type and stored.  Each of begin, end and to is
   evaluated exactly once. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to)   \
    do {                                                                \
        to_type *_to = (to_type *)(to);                                 \
        const from_type *_iter = (from_type *)(begin);                  \
        const from_type *_end = (from_type *)(end);                     \
        Py_ssize_t _count = _end - _iter;                               \
        const from_type *_quad_end =                                    \
            _iter + _Py_SIZE_ROUND_DOWN(_count, 4);                     \
        /* main part: four elements per iteration */                    \
        for (; _iter < _quad_end; _iter += 4, _to += 4) {               \
            _to[0] = (to_type) _iter[0];                                \
            _to[1] = (to_type) _iter[1];                                \
            _to[2] = (to_type) _iter[2];                                \
            _to[3] = (to_type) _iter[3];                                \
        }                                                               \
        /* tail: whatever is left, one element at a time */             \
        for (; _iter < _end; ++_iter, ++_to)                            \
            *_to = (to_type) *_iter;                                    \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200165
/* Platform-tuned overallocation factor. */
#ifndef MS_WINDOWS
   /* Elsewhere (tuned on Linux): overallocating by 25% works best. */
#  define OVERALLOCATE_FACTOR 4
#else
   /* On Windows, overallocating by 50% works best. */
#  define OVERALLOCATE_FACTOR 2
#endif
173
Walter Dörwald16807132007-05-25 13:52:07 +0000174/* This dictionary holds all interned unicode strings. Note that references
175 to strings in this dictionary are *not* counted in the string's ob_refcnt.
176 When the interned string reaches a refcnt of 0 the string deallocation
177 function will delete the reference from this dictionary.
178
179 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000180 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000181*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200182static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000183
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000184/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200185static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200186
Serhiy Storchaka678db842013-01-26 12:16:36 +0200187#define _Py_INCREF_UNICODE_EMPTY() \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200188 do { \
189 if (unicode_empty != NULL) \
190 Py_INCREF(unicode_empty); \
191 else { \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200192 unicode_empty = PyUnicode_New(0, 0); \
193 if (unicode_empty != NULL) { \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200194 Py_INCREF(unicode_empty); \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200195 assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
196 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200197 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200198 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000199
Serhiy Storchaka678db842013-01-26 12:16:36 +0200200#define _Py_RETURN_UNICODE_EMPTY() \
201 do { \
202 _Py_INCREF_UNICODE_EMPTY(); \
203 return unicode_empty; \
204 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000205
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200206/* Forward declaration */
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700207static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200208_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
209
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200210/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200211static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200212
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000213/* Single character Unicode strings in the Latin-1 range are being
214 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200215static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000216
/* Fast detection of the most frequent whitespace characters: a 128-entry
   lookup table indexed by ASCII code, 1 for whitespace and 0 otherwise.

   Entries set to 1:
     0x09 CHARACTER TABULATION   0x0A LINE FEED
     0x0B LINE TABULATION        0x0C FORM FEED
     0x0D CARRIAGE RETURN
     0x1C FILE SEPARATOR         0x1D GROUP SEPARATOR
     0x1E RECORD SEPARATOR       0x1F UNIT SEPARATOR
     0x20 SPACE
*/
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 */ 1, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
247
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200248/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200249static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200250static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100251static int unicode_modifiable(PyObject *unicode);
252
Victor Stinnerfe226c02011-10-03 03:52:20 +0200253
Alexander Belopolsky40018472011-02-26 01:02:56 +0000254static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100255_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200256static PyObject *
257_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
258static PyObject *
259_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
260
261static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000262unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000263 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100264 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000265 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
266
Alexander Belopolsky40018472011-02-26 01:02:56 +0000267static void
268raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300269 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100270 PyObject *unicode,
271 Py_ssize_t startpos, Py_ssize_t endpos,
272 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000273
/* Same kind of table for line breaks: 128 entries indexed by ASCII code,
   1 for a line break character and 0 otherwise.

   Entries set to 1:
     0x0A LINE FEED              0x0B LINE TABULATION
     0x0C FORM FEED              0x0D CARRIAGE RETURN
     0x1C FILE SEPARATOR         0x1D GROUP SEPARATOR
     0x1E RECORD SEPARATOR
*/
static const unsigned char ascii_linebreak[] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
301
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300302#include "clinic/unicodeobject.c.h"
303
/* Codec error handlers that get_error_handler() can identify by name. */
typedef enum {
    _Py_ERROR_UNKNOWN = 0,
    _Py_ERROR_STRICT,               /* "strict", also used for NULL */
    _Py_ERROR_SURROGATEESCAPE,      /* "surrogateescape" */
    _Py_ERROR_REPLACE,              /* "replace" */
    _Py_ERROR_IGNORE,               /* "ignore" */
    _Py_ERROR_BACKSLASHREPLACE,     /* "backslashreplace" */
    _Py_ERROR_SURROGATEPASS,        /* "surrogatepass" */
    _Py_ERROR_XMLCHARREFREPLACE,    /* "xmlcharrefreplace" */
    _Py_ERROR_OTHER                 /* any other name */
} _Py_error_handler;

/* Map an error handler name to its _Py_error_handler value.

   errors may be NULL, which means "strict".  The comparison is exact and
   case-sensitive; a name that is not in the table yields _Py_ERROR_OTHER. */
static _Py_error_handler
get_error_handler(const char *errors)
{
    static const struct {
        const char *name;
        _Py_error_handler handler;
    } known_handlers[] = {
        {"strict",            _Py_ERROR_STRICT},
        {"surrogateescape",   _Py_ERROR_SURROGATEESCAPE},
        {"replace",           _Py_ERROR_REPLACE},
        {"ignore",            _Py_ERROR_IGNORE},
        {"backslashreplace",  _Py_ERROR_BACKSLASHREPLACE},
        {"surrogatepass",     _Py_ERROR_SURROGATEPASS},
        {"xmlcharrefreplace", _Py_ERROR_XMLCHARREFREPLACE},
    };
    size_t idx;

    if (errors == NULL) {
        return _Py_ERROR_STRICT;
    }
    for (idx = 0; idx < sizeof(known_handlers) / sizeof(known_handlers[0]); idx++) {
        if (strcmp(errors, known_handlers[idx].name) == 0) {
            return known_handlers[idx].handler;
        }
    }
    return _Py_ERROR_OTHER;
}
342
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300343/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
344 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000345Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000346PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000347{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000348#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000349 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000350#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000351 /* This is actually an illegal character, so it should
352 not be passed to unichr. */
353 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000354#endif
355}
356
#ifdef Py_DEBUG
/* Debug-only sanity check of the internal invariants of a unicode object.

   Every invariant is tested with assert(), so a violation aborts at the
   failing check.  If no assertion fires the function returns 1, which
   allows callers to write assert(_PyUnicode_CheckConsistency(op, ...)).

   op:            the object to check; must pass PyUnicode_Check().
   check_content: if nonzero (and the string is not of the wchar kind),
                  also scan the characters to verify that the storage kind
                  matches the largest character present and that the
                  element at index `length` is 0. */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    /* compact ASCII: must be stored as 1-byte kind and be ready */
    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* compact, non-ASCII: the character data is taken to start
               right after the PyCompactUnicodeObject header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }
        else {
            /* non-compact: the character data is referenced through
               data.any */
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* not ready yet: only wstr is set; there is no length,
                   hash, data or utf8, and the string is not interned */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
            else {
                /* ready, non-compact string */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                /* an ASCII string shares its data as UTF-8; a non-ASCII
                   string must not */
                if (ascii->state.ascii) {
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
                else
                    assert (compact->utf8 != data);
            }
        }
        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr must be shared with the data exactly when the kind has
               the same width as wchar_t */
            if (
#if SIZEOF_WCHAR_T == 2
                kind == PyUnicode_2BYTE_KIND
#else
                kind == PyUnicode_4BYTE_KIND
#endif
               )
            {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            } else
                assert(ascii->wstr != data);
        }

        /* an absent representation must have a zero length */
        if (compact->utf8 == NULL)
            assert(compact->utf8_length == 0);
        if (ascii->wstr == NULL)
            assert(compact->wstr_length == 0);
    }
    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND)
    {
        Py_ssize_t i;
        Py_UCS4 maxchar = 0;
        void *data;
        Py_UCS4 ch;

        /* find the largest character of the string */
        data = PyUnicode_DATA(ascii);
        for (i=0; i < ascii->length; i++)
        {
            ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
        }
        /* the largest character must fall into the range of the kind:
           a narrower kind would otherwise have been sufficient */
        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else
                assert(maxchar < 128);
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
        }
        else {
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
        }
        /* the element following the last character must be 0 */
        assert(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;
}
#endif
472
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100473static PyObject*
474unicode_result_wchar(PyObject *unicode)
475{
476#ifndef Py_DEBUG
477 Py_ssize_t len;
478
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100479 len = _PyUnicode_WSTR_LENGTH(unicode);
480 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100481 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200482 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100483 }
484
485 if (len == 1) {
486 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100487 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100488 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
489 Py_DECREF(unicode);
490 return latin1_char;
491 }
492 }
493
494 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200495 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100496 return NULL;
497 }
498#else
Victor Stinneraa771272012-10-04 02:32:58 +0200499 assert(Py_REFCNT(unicode) == 1);
500
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100501 /* don't make the result ready in debug mode to ensure that the caller
502 makes the string ready before using it */
503 assert(_PyUnicode_CheckConsistency(unicode, 1));
504#endif
505 return unicode;
506}
507
508static PyObject*
509unicode_result_ready(PyObject *unicode)
510{
511 Py_ssize_t length;
512
513 length = PyUnicode_GET_LENGTH(unicode);
514 if (length == 0) {
515 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100516 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200517 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100518 }
519 return unicode_empty;
520 }
521
522 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200523 void *data = PyUnicode_DATA(unicode);
524 int kind = PyUnicode_KIND(unicode);
525 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100526 if (ch < 256) {
527 PyObject *latin1_char = unicode_latin1[ch];
528 if (latin1_char != NULL) {
529 if (unicode != latin1_char) {
530 Py_INCREF(latin1_char);
531 Py_DECREF(unicode);
532 }
533 return latin1_char;
534 }
535 else {
536 assert(_PyUnicode_CheckConsistency(unicode, 1));
537 Py_INCREF(unicode);
538 unicode_latin1[ch] = unicode;
539 return unicode;
540 }
541 }
542 }
543
544 assert(_PyUnicode_CheckConsistency(unicode, 1));
545 return unicode;
546}
547
548static PyObject*
549unicode_result(PyObject *unicode)
550{
551 assert(_PyUnicode_CHECK(unicode));
552 if (PyUnicode_IS_READY(unicode))
553 return unicode_result_ready(unicode);
554 else
555 return unicode_result_wchar(unicode);
556}
557
Victor Stinnerc4b49542011-12-11 22:44:26 +0100558static PyObject*
559unicode_result_unchanged(PyObject *unicode)
560{
561 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500562 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100563 return NULL;
564 Py_INCREF(unicode);
565 return unicode;
566 }
567 else
568 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100569 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100570}
571
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200572/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
573 ASCII, Latin1, UTF-8, etc. */
574static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200575backslashreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200576 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
577{
Victor Stinnerad771582015-10-09 12:38:53 +0200578 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200579 Py_UCS4 ch;
580 enum PyUnicode_Kind kind;
581 void *data;
582
583 assert(PyUnicode_IS_READY(unicode));
584 kind = PyUnicode_KIND(unicode);
585 data = PyUnicode_DATA(unicode);
586
587 size = 0;
588 /* determine replacement size */
589 for (i = collstart; i < collend; ++i) {
590 Py_ssize_t incr;
591
592 ch = PyUnicode_READ(kind, data, i);
593 if (ch < 0x100)
594 incr = 2+2;
595 else if (ch < 0x10000)
596 incr = 2+4;
597 else {
598 assert(ch <= MAX_UNICODE);
Victor Stinner3fa36ff2015-10-09 03:37:11 +0200599 incr = 2+8;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200600 }
601 if (size > PY_SSIZE_T_MAX - incr) {
602 PyErr_SetString(PyExc_OverflowError,
603 "encoded result is too long for a Python string");
604 return NULL;
605 }
606 size += incr;
607 }
608
Victor Stinnerad771582015-10-09 12:38:53 +0200609 str = _PyBytesWriter_Prepare(writer, str, size);
610 if (str == NULL)
611 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200612
613 /* generate replacement */
614 for (i = collstart; i < collend; ++i) {
615 ch = PyUnicode_READ(kind, data, i);
Victor Stinner797485e2015-10-09 03:17:30 +0200616 *str++ = '\\';
617 if (ch >= 0x00010000) {
618 *str++ = 'U';
619 *str++ = Py_hexdigits[(ch>>28)&0xf];
620 *str++ = Py_hexdigits[(ch>>24)&0xf];
621 *str++ = Py_hexdigits[(ch>>20)&0xf];
622 *str++ = Py_hexdigits[(ch>>16)&0xf];
623 *str++ = Py_hexdigits[(ch>>12)&0xf];
624 *str++ = Py_hexdigits[(ch>>8)&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200625 }
Victor Stinner797485e2015-10-09 03:17:30 +0200626 else if (ch >= 0x100) {
627 *str++ = 'u';
628 *str++ = Py_hexdigits[(ch>>12)&0xf];
629 *str++ = Py_hexdigits[(ch>>8)&0xf];
630 }
631 else
632 *str++ = 'x';
633 *str++ = Py_hexdigits[(ch>>4)&0xf];
634 *str++ = Py_hexdigits[ch&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200635 }
636 return str;
637}
638
639/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
640 ASCII, Latin1, UTF-8, etc. */
641static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200642xmlcharrefreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200643 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
644{
Victor Stinnerad771582015-10-09 12:38:53 +0200645 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200646 Py_UCS4 ch;
647 enum PyUnicode_Kind kind;
648 void *data;
649
650 assert(PyUnicode_IS_READY(unicode));
651 kind = PyUnicode_KIND(unicode);
652 data = PyUnicode_DATA(unicode);
653
654 size = 0;
655 /* determine replacement size */
656 for (i = collstart; i < collend; ++i) {
657 Py_ssize_t incr;
658
659 ch = PyUnicode_READ(kind, data, i);
660 if (ch < 10)
661 incr = 2+1+1;
662 else if (ch < 100)
663 incr = 2+2+1;
664 else if (ch < 1000)
665 incr = 2+3+1;
666 else if (ch < 10000)
667 incr = 2+4+1;
668 else if (ch < 100000)
669 incr = 2+5+1;
670 else if (ch < 1000000)
671 incr = 2+6+1;
672 else {
673 assert(ch <= MAX_UNICODE);
674 incr = 2+7+1;
675 }
676 if (size > PY_SSIZE_T_MAX - incr) {
677 PyErr_SetString(PyExc_OverflowError,
678 "encoded result is too long for a Python string");
679 return NULL;
680 }
681 size += incr;
682 }
683
Victor Stinnerad771582015-10-09 12:38:53 +0200684 str = _PyBytesWriter_Prepare(writer, str, size);
685 if (str == NULL)
686 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200687
688 /* generate replacement */
689 for (i = collstart; i < collend; ++i) {
690 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
691 }
692 return str;
693}
694
/* --- Bloom Filters ----------------------------------------------------- */

/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask of BLOOM_WIDTH bits and
   use the low bits of each unicode character (ch & (BLOOM_WIDTH - 1),
   i.e. 5, 6 or 7 bits depending on the width of a long) as the bit
   index. */

/* the linebreak mask is set up by Unicode_Init below */

/* BLOOM_WIDTH: number of usable bits in a mask; the largest supported
   power of two that fits in an unsigned long. */
#if LONG_BIT >= 128
#define BLOOM_WIDTH 128
#elif LONG_BIT >= 64
#define BLOOM_WIDTH 64
#elif LONG_BIT >= 32
#define BLOOM_WIDTH 32
#else
#error "LONG_BIT is smaller than 32"
#endif

#define BLOOM_MASK unsigned long

/* starts with every bit set, so BLOOM() reports "maybe present" for any
   character until the mask is replaced */
static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;

/* Non-zero if the bit selected by ch is set in mask, i.e. ch *may* be one
   of the characters the mask was built from; zero means it definitely is
   not.  Both arguments are parenthesized so that compound expressions
   (e.g. BLOOM(a | b, ch)) are evaluated as a whole. */
#define BLOOM(mask, ch) (((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))

/* Line break test: table lookup for ch < 128, otherwise the bloom mask is
   used as a cheap pre-filter in front of Py_UNICODE_ISLINEBREAK(). */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000722
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700723static inline BLOOM_MASK
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200724make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000725{
Victor Stinnera85af502013-04-09 21:53:54 +0200726#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
727 do { \
728 TYPE *data = (TYPE *)PTR; \
729 TYPE *end = data + LEN; \
730 Py_UCS4 ch; \
731 for (; data != end; data++) { \
732 ch = *data; \
733 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
734 } \
735 break; \
736 } while (0)
737
Thomas Wouters477c8d52006-05-27 19:21:47 +0000738 /* calculate simple bloom-style bitmask for a given unicode string */
739
Antoine Pitrouf068f942010-01-13 14:19:12 +0000740 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000741
742 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200743 switch (kind) {
744 case PyUnicode_1BYTE_KIND:
745 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
746 break;
747 case PyUnicode_2BYTE_KIND:
748 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
749 break;
750 case PyUnicode_4BYTE_KIND:
751 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
752 break;
753 default:
754 assert(0);
755 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000756 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200757
758#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000759}
760
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300761static int
762ensure_unicode(PyObject *obj)
763{
764 if (!PyUnicode_Check(obj)) {
765 PyErr_Format(PyExc_TypeError,
766 "must be str, not %.100s",
767 Py_TYPE(obj)->tp_name);
768 return -1;
769 }
770 return PyUnicode_READY(obj);
771}
772
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200773/* Compilation of templated routines */
774
775#include "stringlib/asciilib.h"
776#include "stringlib/fastsearch.h"
777#include "stringlib/partition.h"
778#include "stringlib/split.h"
779#include "stringlib/count.h"
780#include "stringlib/find.h"
781#include "stringlib/find_max_char.h"
782#include "stringlib/localeutil.h"
783#include "stringlib/undef.h"
784
785#include "stringlib/ucs1lib.h"
786#include "stringlib/fastsearch.h"
787#include "stringlib/partition.h"
788#include "stringlib/split.h"
789#include "stringlib/count.h"
790#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300791#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200792#include "stringlib/find_max_char.h"
793#include "stringlib/localeutil.h"
794#include "stringlib/undef.h"
795
796#include "stringlib/ucs2lib.h"
797#include "stringlib/fastsearch.h"
798#include "stringlib/partition.h"
799#include "stringlib/split.h"
800#include "stringlib/count.h"
801#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300802#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200803#include "stringlib/find_max_char.h"
804#include "stringlib/localeutil.h"
805#include "stringlib/undef.h"
806
807#include "stringlib/ucs4lib.h"
808#include "stringlib/fastsearch.h"
809#include "stringlib/partition.h"
810#include "stringlib/split.h"
811#include "stringlib/count.h"
812#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300813#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200814#include "stringlib/find_max_char.h"
815#include "stringlib/localeutil.h"
816#include "stringlib/undef.h"
817
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200818#include "stringlib/unicodedefs.h"
819#include "stringlib/fastsearch.h"
820#include "stringlib/count.h"
821#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100822#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200823
Guido van Rossumd57fd912000-03-10 22:53:23 +0000824/* --- Unicode Object ----------------------------------------------------- */
825
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200826static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200827fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200828
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700829static inline Py_ssize_t
830findchar(const void *s, int kind,
831 Py_ssize_t size, Py_UCS4 ch,
832 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200833{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200834 switch (kind) {
835 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200836 if ((Py_UCS1) ch != ch)
837 return -1;
838 if (direction > 0)
839 return ucs1lib_find_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
840 else
841 return ucs1lib_rfind_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200842 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200843 if ((Py_UCS2) ch != ch)
844 return -1;
845 if (direction > 0)
846 return ucs2lib_find_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
847 else
848 return ucs2lib_rfind_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200849 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200850 if (direction > 0)
851 return ucs4lib_find_char((Py_UCS4 *) s, size, ch);
852 else
853 return ucs4lib_rfind_char((Py_UCS4 *) s, size, ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200854 default:
855 assert(0);
856 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200857 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200858}
859
#ifdef Py_DEBUG
/* Debug helper: overwrite the characters of a Unicode string located
   between old_length and its current length with 0xff bytes, so that code
   using them before they are initialized is detected earlier.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0.

   Nothing is written when the current length is not greater than
   old_length. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    /* the kind value is used as the size of one character in bytes */
    int char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        memset(bytes + old_length * char_size, 0xff,
               (new_length - old_length) * char_size);
    }
}
#endif
878
Victor Stinnerfe226c02011-10-03 03:52:20 +0200879static PyObject*
880resize_compact(PyObject *unicode, Py_ssize_t length)
881{
882 Py_ssize_t char_size;
883 Py_ssize_t struct_size;
884 Py_ssize_t new_size;
885 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100886 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +0200887#ifdef Py_DEBUG
888 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
889#endif
890
Victor Stinner79891572012-05-03 13:43:07 +0200891 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200892 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100893 assert(PyUnicode_IS_COMPACT(unicode));
894
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200895 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100896 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200897 struct_size = sizeof(PyASCIIObject);
898 else
899 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200900 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200901
Victor Stinnerfe226c02011-10-03 03:52:20 +0200902 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
903 PyErr_NoMemory();
904 return NULL;
905 }
906 new_size = (struct_size + (length + 1) * char_size);
907
Serhiy Storchaka7aa69082015-12-03 01:02:03 +0200908 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
909 PyObject_DEL(_PyUnicode_UTF8(unicode));
910 _PyUnicode_UTF8(unicode) = NULL;
911 _PyUnicode_UTF8_LENGTH(unicode) = 0;
912 }
Victor Stinner84def372011-12-11 20:04:56 +0100913 _Py_DEC_REFTOTAL;
914 _Py_ForgetReference(unicode);
915
Serhiy Storchaka20b39b22014-09-28 11:27:24 +0300916 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +0100917 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100918 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200919 PyErr_NoMemory();
920 return NULL;
921 }
Victor Stinner84def372011-12-11 20:04:56 +0100922 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200923 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100924
Victor Stinnerfe226c02011-10-03 03:52:20 +0200925 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200926 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200927 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100928 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200929 _PyUnicode_WSTR_LENGTH(unicode) = length;
930 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100931 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
932 PyObject_DEL(_PyUnicode_WSTR(unicode));
933 _PyUnicode_WSTR(unicode) = NULL;
Victor Stinner5bc03a62016-01-27 16:56:53 +0100934 if (!PyUnicode_IS_ASCII(unicode))
935 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100936 }
Victor Stinnerafffce42012-10-03 23:03:17 +0200937#ifdef Py_DEBUG
938 unicode_fill_invalid(unicode, old_length);
939#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200940 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
941 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200942 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200943 return unicode;
944}
945
Alexander Belopolsky40018472011-02-26 01:02:56 +0000946static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200947resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000948{
Victor Stinner95663112011-10-04 01:03:50 +0200949 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100950 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200951 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200952 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000953
Victor Stinnerfe226c02011-10-03 03:52:20 +0200954 if (PyUnicode_IS_READY(unicode)) {
955 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200956 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200957 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +0200958#ifdef Py_DEBUG
959 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
960#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200961
962 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200963 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200964 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
965 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200966
967 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
968 PyErr_NoMemory();
969 return -1;
970 }
971 new_size = (length + 1) * char_size;
972
Victor Stinner7a9105a2011-12-12 00:13:42 +0100973 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
974 {
975 PyObject_DEL(_PyUnicode_UTF8(unicode));
976 _PyUnicode_UTF8(unicode) = NULL;
977 _PyUnicode_UTF8_LENGTH(unicode) = 0;
978 }
979
Victor Stinnerfe226c02011-10-03 03:52:20 +0200980 data = (PyObject *)PyObject_REALLOC(data, new_size);
981 if (data == NULL) {
982 PyErr_NoMemory();
983 return -1;
984 }
985 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200986 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200987 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200988 _PyUnicode_WSTR_LENGTH(unicode) = length;
989 }
990 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200991 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200992 _PyUnicode_UTF8_LENGTH(unicode) = length;
993 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200994 _PyUnicode_LENGTH(unicode) = length;
995 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +0200996#ifdef Py_DEBUG
997 unicode_fill_invalid(unicode, old_length);
998#endif
Victor Stinner95663112011-10-04 01:03:50 +0200999 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001000 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001001 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001002 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001003 }
Victor Stinner95663112011-10-04 01:03:50 +02001004 assert(_PyUnicode_WSTR(unicode) != NULL);
1005
1006 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001007 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +02001008 PyErr_NoMemory();
1009 return -1;
1010 }
Victor Stinner7a9105a2011-12-12 00:13:42 +01001011 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +02001012 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +01001013 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +02001014 if (!wstr) {
1015 PyErr_NoMemory();
1016 return -1;
1017 }
1018 _PyUnicode_WSTR(unicode) = wstr;
1019 _PyUnicode_WSTR(unicode)[length] = 0;
1020 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001021 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001022 return 0;
1023}
1024
Victor Stinnerfe226c02011-10-03 03:52:20 +02001025static PyObject*
1026resize_copy(PyObject *unicode, Py_ssize_t length)
1027{
1028 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001029 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001030 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001031
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001032 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001033
1034 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1035 if (copy == NULL)
1036 return NULL;
1037
1038 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +02001039 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001040 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +02001041 }
1042 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001043 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001044
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001045 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001046 if (w == NULL)
1047 return NULL;
1048 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1049 copy_length = Py_MIN(copy_length, length);
Christian Heimesf051e432016-09-13 20:22:02 +02001050 memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +02001051 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001052 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001053 }
1054}
1055
Guido van Rossumd57fd912000-03-10 22:53:23 +00001056/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +00001057 Ux0000 terminated; some code (e.g. new_identifier)
1058 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001059
1060 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +00001061 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001062
1063*/
1064
Alexander Belopolsky40018472011-02-26 01:02:56 +00001065static PyUnicodeObject *
1066_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001067{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001068 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001069 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001070
Thomas Wouters477c8d52006-05-27 19:21:47 +00001071 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001072 if (length == 0 && unicode_empty != NULL) {
1073 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001074 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001075 }
1076
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001077 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001078 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001079 return (PyUnicodeObject *)PyErr_NoMemory();
1080 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001081 if (length < 0) {
1082 PyErr_SetString(PyExc_SystemError,
1083 "Negative size passed to _PyUnicode_New");
1084 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001085 }
1086
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001087 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1088 if (unicode == NULL)
1089 return NULL;
1090 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +01001091
1092 _PyUnicode_WSTR_LENGTH(unicode) = length;
1093 _PyUnicode_HASH(unicode) = -1;
1094 _PyUnicode_STATE(unicode).interned = 0;
1095 _PyUnicode_STATE(unicode).kind = 0;
1096 _PyUnicode_STATE(unicode).compact = 0;
1097 _PyUnicode_STATE(unicode).ready = 0;
1098 _PyUnicode_STATE(unicode).ascii = 0;
1099 _PyUnicode_DATA_ANY(unicode) = NULL;
1100 _PyUnicode_LENGTH(unicode) = 0;
1101 _PyUnicode_UTF8(unicode) = NULL;
1102 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1103
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001104 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1105 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001106 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00001107 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001108 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00001109 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001110
Jeremy Hyltond8082792003-09-16 19:41:39 +00001111 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +00001112 * the caller fails before initializing str -- unicode_resize()
1113 * reads str[0], and the Keep-Alive optimization can keep memory
1114 * allocated for str alive across a call to unicode_dealloc(unicode).
1115 * We don't want unicode_resize to read uninitialized memory in
1116 * that case.
1117 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001118 _PyUnicode_WSTR(unicode)[0] = 0;
1119 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +01001120
Victor Stinner7931d9a2011-11-04 00:22:48 +01001121 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001122 return unicode;
1123}
1124
Victor Stinnerf42dc442011-10-02 23:33:16 +02001125static const char*
1126unicode_kind_name(PyObject *unicode)
1127{
Victor Stinner42dfd712011-10-03 14:41:45 +02001128 /* don't check consistency: unicode_kind_name() is called from
1129 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +02001130 if (!PyUnicode_IS_COMPACT(unicode))
1131 {
1132 if (!PyUnicode_IS_READY(unicode))
1133 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -06001134 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001135 {
1136 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001137 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001138 return "legacy ascii";
1139 else
1140 return "legacy latin1";
1141 case PyUnicode_2BYTE_KIND:
1142 return "legacy UCS2";
1143 case PyUnicode_4BYTE_KIND:
1144 return "legacy UCS4";
1145 default:
1146 return "<legacy invalid kind>";
1147 }
1148 }
1149 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -06001150 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001151 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001152 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001153 return "ascii";
1154 else
Victor Stinnera3b334d2011-10-03 13:53:37 +02001155 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001156 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001157 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001158 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001159 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001160 default:
1161 return "<invalid compact kind>";
1162 }
1163}
1164
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001165#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001166/* Functions wrapping macros for use in debugger */
/* Function form of the PyUnicode_UTF8() macro, callable from a debugger. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
1170
/* Function form of the _PyUnicode_COMPACT_DATA() macro, callable from a
   debugger. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *chars = _PyUnicode_COMPACT_DATA(unicode);
    return chars;
}
1174void *_PyUnicode_data(void *unicode){
1175 printf("obj %p\n", unicode);
1176 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1177 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1178 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1179 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1180 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1181 return PyUnicode_DATA(unicode);
1182}
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001183
1184void
1185_PyUnicode_Dump(PyObject *op)
1186{
1187 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001188 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1189 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1190 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001191
Victor Stinnera849a4b2011-10-03 12:12:11 +02001192 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001193 {
1194 if (ascii->state.ascii)
1195 data = (ascii + 1);
1196 else
1197 data = (compact + 1);
1198 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001199 else
1200 data = unicode->data.any;
Victor Stinner293f3f52014-07-01 08:57:10 +02001201 printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1202 unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001203
Victor Stinnera849a4b2011-10-03 12:12:11 +02001204 if (ascii->wstr == data)
1205 printf("shared ");
1206 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001207
Victor Stinnera3b334d2011-10-03 13:53:37 +02001208 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinner293f3f52014-07-01 08:57:10 +02001209 printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
Victor Stinnera849a4b2011-10-03 12:12:11 +02001210 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1211 printf("shared ");
Victor Stinner293f3f52014-07-01 08:57:10 +02001212 printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
1213 compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001214 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001215 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001216}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001217#endif
1218
1219PyObject *
1220PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1221{
1222 PyObject *obj;
1223 PyCompactUnicodeObject *unicode;
1224 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001225 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001226 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001227 Py_ssize_t char_size;
1228 Py_ssize_t struct_size;
1229
1230 /* Optimization for empty strings */
1231 if (size == 0 && unicode_empty != NULL) {
1232 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001233 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001234 }
1235
Victor Stinner9e9d6892011-10-04 01:02:02 +02001236 is_ascii = 0;
1237 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001238 struct_size = sizeof(PyCompactUnicodeObject);
1239 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001240 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001241 char_size = 1;
1242 is_ascii = 1;
1243 struct_size = sizeof(PyASCIIObject);
1244 }
1245 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001246 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001247 char_size = 1;
1248 }
1249 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001250 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001251 char_size = 2;
1252 if (sizeof(wchar_t) == 2)
1253 is_sharing = 1;
1254 }
1255 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001256 if (maxchar > MAX_UNICODE) {
1257 PyErr_SetString(PyExc_SystemError,
1258 "invalid maximum character passed to PyUnicode_New");
1259 return NULL;
1260 }
Victor Stinner8f825062012-04-27 13:55:39 +02001261 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001262 char_size = 4;
1263 if (sizeof(wchar_t) == 4)
1264 is_sharing = 1;
1265 }
1266
1267 /* Ensure we won't overflow the size. */
1268 if (size < 0) {
1269 PyErr_SetString(PyExc_SystemError,
1270 "Negative size passed to PyUnicode_New");
1271 return NULL;
1272 }
1273 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1274 return PyErr_NoMemory();
1275
1276 /* Duplicated allocation code from _PyObject_New() instead of a call to
1277 * PyObject_New() so we are able to allocate space for the object and
1278 * it's data buffer.
1279 */
1280 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1281 if (obj == NULL)
1282 return PyErr_NoMemory();
1283 obj = PyObject_INIT(obj, &PyUnicode_Type);
1284 if (obj == NULL)
1285 return NULL;
1286
1287 unicode = (PyCompactUnicodeObject *)obj;
1288 if (is_ascii)
1289 data = ((PyASCIIObject*)obj) + 1;
1290 else
1291 data = unicode + 1;
1292 _PyUnicode_LENGTH(unicode) = size;
1293 _PyUnicode_HASH(unicode) = -1;
1294 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001295 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001296 _PyUnicode_STATE(unicode).compact = 1;
1297 _PyUnicode_STATE(unicode).ready = 1;
1298 _PyUnicode_STATE(unicode).ascii = is_ascii;
1299 if (is_ascii) {
1300 ((char*)data)[size] = 0;
1301 _PyUnicode_WSTR(unicode) = NULL;
1302 }
Victor Stinner8f825062012-04-27 13:55:39 +02001303 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001304 ((char*)data)[size] = 0;
1305 _PyUnicode_WSTR(unicode) = NULL;
1306 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001307 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001308 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001309 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001310 else {
1311 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001312 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001313 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001314 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001315 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001316 ((Py_UCS4*)data)[size] = 0;
1317 if (is_sharing) {
1318 _PyUnicode_WSTR_LENGTH(unicode) = size;
1319 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1320 }
1321 else {
1322 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1323 _PyUnicode_WSTR(unicode) = NULL;
1324 }
1325 }
Victor Stinner8f825062012-04-27 13:55:39 +02001326#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001327 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001328#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001329 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001330 return obj;
1331}
1332
1333#if SIZEOF_WCHAR_T == 2
1334/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1335 will decode surrogate pairs, the other conversions are implemented as macros
Georg Brandl7597add2011-10-05 16:36:47 +02001336 for efficiency.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001337
1338 This function assumes that unicode can hold one more code point than wstr
1339 characters for a terminating null character. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001340static void
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001341unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001342 PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001343{
1344 const wchar_t *iter;
1345 Py_UCS4 *ucs4_out;
1346
Victor Stinner910337b2011-10-03 03:20:16 +02001347 assert(unicode != NULL);
1348 assert(_PyUnicode_CHECK(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001349 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1350 ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1351
1352 for (iter = begin; iter < end; ) {
1353 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1354 _PyUnicode_GET_LENGTH(unicode)));
Victor Stinner551ac952011-11-29 22:58:13 +01001355 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1356 && (iter+1) < end
1357 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001358 {
Victor Stinner551ac952011-11-29 22:58:13 +01001359 *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001360 iter += 2;
1361 }
1362 else {
1363 *ucs4_out++ = *iter;
1364 iter++;
1365 }
1366 }
1367 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1368 _PyUnicode_GET_LENGTH(unicode)));
1369
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001370}
1371#endif
1372
Victor Stinnercd9950f2011-10-02 00:34:53 +02001373static int
Victor Stinner488fa492011-12-12 00:01:39 +01001374unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001375{
Victor Stinner488fa492011-12-12 00:01:39 +01001376 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001377 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001378 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001379 return -1;
1380 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001381 return 0;
1382}
1383
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001384static int
1385_copy_characters(PyObject *to, Py_ssize_t to_start,
1386 PyObject *from, Py_ssize_t from_start,
1387 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001388{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001389 unsigned int from_kind, to_kind;
1390 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001391
Victor Stinneree4544c2012-05-09 22:24:08 +02001392 assert(0 <= how_many);
1393 assert(0 <= from_start);
1394 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001395 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001396 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001397 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001398
Victor Stinnerd3f08822012-05-29 12:57:52 +02001399 assert(PyUnicode_Check(to));
1400 assert(PyUnicode_IS_READY(to));
1401 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1402
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001403 if (how_many == 0)
1404 return 0;
1405
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001406 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001407 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001408 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001409 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001410
Victor Stinnerf1852262012-06-16 16:38:26 +02001411#ifdef Py_DEBUG
1412 if (!check_maxchar
1413 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1414 {
1415 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1416 Py_UCS4 ch;
1417 Py_ssize_t i;
1418 for (i=0; i < how_many; i++) {
1419 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1420 assert(ch <= to_maxchar);
1421 }
1422 }
1423#endif
1424
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001425 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001426 if (check_maxchar
1427 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1428 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001429 /* Writing Latin-1 characters into an ASCII string requires to
1430 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001431 Py_UCS4 max_char;
1432 max_char = ucs1lib_find_max_char(from_data,
1433 (Py_UCS1*)from_data + how_many);
1434 if (max_char >= 128)
1435 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001436 }
Christian Heimesf051e432016-09-13 20:22:02 +02001437 memcpy((char*)to_data + to_kind * to_start,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001438 (char*)from_data + from_kind * from_start,
1439 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001440 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001441 else if (from_kind == PyUnicode_1BYTE_KIND
1442 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001443 {
1444 _PyUnicode_CONVERT_BYTES(
1445 Py_UCS1, Py_UCS2,
1446 PyUnicode_1BYTE_DATA(from) + from_start,
1447 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1448 PyUnicode_2BYTE_DATA(to) + to_start
1449 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001450 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001451 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001452 && to_kind == PyUnicode_4BYTE_KIND)
1453 {
1454 _PyUnicode_CONVERT_BYTES(
1455 Py_UCS1, Py_UCS4,
1456 PyUnicode_1BYTE_DATA(from) + from_start,
1457 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1458 PyUnicode_4BYTE_DATA(to) + to_start
1459 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001460 }
1461 else if (from_kind == PyUnicode_2BYTE_KIND
1462 && to_kind == PyUnicode_4BYTE_KIND)
1463 {
1464 _PyUnicode_CONVERT_BYTES(
1465 Py_UCS2, Py_UCS4,
1466 PyUnicode_2BYTE_DATA(from) + from_start,
1467 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1468 PyUnicode_4BYTE_DATA(to) + to_start
1469 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001470 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001471 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001472 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1473
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001474 if (!check_maxchar) {
1475 if (from_kind == PyUnicode_2BYTE_KIND
1476 && to_kind == PyUnicode_1BYTE_KIND)
1477 {
1478 _PyUnicode_CONVERT_BYTES(
1479 Py_UCS2, Py_UCS1,
1480 PyUnicode_2BYTE_DATA(from) + from_start,
1481 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1482 PyUnicode_1BYTE_DATA(to) + to_start
1483 );
1484 }
1485 else if (from_kind == PyUnicode_4BYTE_KIND
1486 && to_kind == PyUnicode_1BYTE_KIND)
1487 {
1488 _PyUnicode_CONVERT_BYTES(
1489 Py_UCS4, Py_UCS1,
1490 PyUnicode_4BYTE_DATA(from) + from_start,
1491 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1492 PyUnicode_1BYTE_DATA(to) + to_start
1493 );
1494 }
1495 else if (from_kind == PyUnicode_4BYTE_KIND
1496 && to_kind == PyUnicode_2BYTE_KIND)
1497 {
1498 _PyUnicode_CONVERT_BYTES(
1499 Py_UCS4, Py_UCS2,
1500 PyUnicode_4BYTE_DATA(from) + from_start,
1501 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1502 PyUnicode_2BYTE_DATA(to) + to_start
1503 );
1504 }
1505 else {
1506 assert(0);
1507 return -1;
1508 }
1509 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001510 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001511 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001512 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001513 Py_ssize_t i;
1514
Victor Stinnera0702ab2011-09-29 14:14:38 +02001515 for (i=0; i < how_many; i++) {
1516 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001517 if (ch > to_maxchar)
1518 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001519 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1520 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001521 }
1522 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001523 return 0;
1524}
1525
Victor Stinnerd3f08822012-05-29 12:57:52 +02001526void
1527_PyUnicode_FastCopyCharacters(
1528 PyObject *to, Py_ssize_t to_start,
1529 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001530{
1531 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1532}
1533
1534Py_ssize_t
1535PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1536 PyObject *from, Py_ssize_t from_start,
1537 Py_ssize_t how_many)
1538{
1539 int err;
1540
1541 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1542 PyErr_BadInternalCall();
1543 return -1;
1544 }
1545
Benjamin Petersonbac79492012-01-14 13:34:47 -05001546 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001547 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001548 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001549 return -1;
1550
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001551 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001552 PyErr_SetString(PyExc_IndexError, "string index out of range");
1553 return -1;
1554 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001555 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001556 PyErr_SetString(PyExc_IndexError, "string index out of range");
1557 return -1;
1558 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001559 if (how_many < 0) {
1560 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1561 return -1;
1562 }
1563 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001564 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1565 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001566 "Cannot write %zi characters at %zi "
1567 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001568 how_many, to_start, PyUnicode_GET_LENGTH(to));
1569 return -1;
1570 }
1571
1572 if (how_many == 0)
1573 return 0;
1574
Victor Stinner488fa492011-12-12 00:01:39 +01001575 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001576 return -1;
1577
1578 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1579 if (err) {
1580 PyErr_Format(PyExc_SystemError,
1581 "Cannot copy %s characters "
1582 "into a string of %s characters",
1583 unicode_kind_name(from),
1584 unicode_kind_name(to));
1585 return -1;
1586 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001587 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001588}
1589
Victor Stinner17222162011-09-28 22:15:37 +02001590/* Find the maximum code point and count the number of surrogate pairs so a
1591 correct string length can be computed before converting a string to UCS4.
1592 This function counts single surrogates as a character and not as a pair.
1593
1594 Return 0 on success, or -1 on error. */
1595static int
1596find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1597 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001598{
1599 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001600 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001601
Victor Stinnerc53be962011-10-02 21:33:54 +02001602 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001603 *num_surrogates = 0;
1604 *maxchar = 0;
1605
1606 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001607#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001608 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1609 && (iter+1) < end
1610 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1611 {
1612 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1613 ++(*num_surrogates);
1614 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001615 }
1616 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001617#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001618 {
1619 ch = *iter;
1620 iter++;
1621 }
1622 if (ch > *maxchar) {
1623 *maxchar = ch;
1624 if (*maxchar > MAX_UNICODE) {
1625 PyErr_Format(PyExc_ValueError,
1626 "character U+%x is not in range [U+0000; U+10ffff]",
1627 ch);
1628 return -1;
1629 }
1630 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001631 }
1632 return 0;
1633}
1634
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001635int
1636_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001637{
1638 wchar_t *end;
1639 Py_UCS4 maxchar = 0;
1640 Py_ssize_t num_surrogates;
1641#if SIZEOF_WCHAR_T == 2
1642 Py_ssize_t length_wo_surrogates;
1643#endif
1644
Georg Brandl7597add2011-10-05 16:36:47 +02001645 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001646 strings were created using _PyObject_New() and where no canonical
1647 representation (the str field) has been set yet aka strings
1648 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001649 assert(_PyUnicode_CHECK(unicode));
1650 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001651 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001652 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001653 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001654 /* Actually, it should neither be interned nor be anything else: */
1655 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001656
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001657 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001658 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001659 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001660 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001661
1662 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001663 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1664 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001665 PyErr_NoMemory();
1666 return -1;
1667 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001668 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001669 _PyUnicode_WSTR(unicode), end,
1670 PyUnicode_1BYTE_DATA(unicode));
1671 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1672 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1673 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1674 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001675 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001676 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001677 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001678 }
1679 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001680 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001681 _PyUnicode_UTF8(unicode) = NULL;
1682 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001683 }
1684 PyObject_FREE(_PyUnicode_WSTR(unicode));
1685 _PyUnicode_WSTR(unicode) = NULL;
1686 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1687 }
1688 /* In this case we might have to convert down from 4-byte native
1689 wchar_t to 2-byte unicode. */
1690 else if (maxchar < 65536) {
1691 assert(num_surrogates == 0 &&
1692 "FindMaxCharAndNumSurrogatePairs() messed up");
1693
Victor Stinner506f5922011-09-28 22:34:18 +02001694#if SIZEOF_WCHAR_T == 2
1695 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001696 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001697 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1698 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1699 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001700 _PyUnicode_UTF8(unicode) = NULL;
1701 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001702#else
1703 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001704 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001705 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001706 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001707 PyErr_NoMemory();
1708 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001709 }
Victor Stinner506f5922011-09-28 22:34:18 +02001710 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1711 _PyUnicode_WSTR(unicode), end,
1712 PyUnicode_2BYTE_DATA(unicode));
1713 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1714 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1715 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001716 _PyUnicode_UTF8(unicode) = NULL;
1717 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001718 PyObject_FREE(_PyUnicode_WSTR(unicode));
1719 _PyUnicode_WSTR(unicode) = NULL;
1720 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1721#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001722 }
1723 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1724 else {
1725#if SIZEOF_WCHAR_T == 2
1726 /* in case the native representation is 2-bytes, we need to allocate a
1727 new normalized 4-byte version. */
1728 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001729 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1730 PyErr_NoMemory();
1731 return -1;
1732 }
Victor Stinnerc3c74152011-10-02 20:39:55 +02001733 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1734 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001735 PyErr_NoMemory();
1736 return -1;
1737 }
1738 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1739 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001740 _PyUnicode_UTF8(unicode) = NULL;
1741 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001742 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1743 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001744 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001745 PyObject_FREE(_PyUnicode_WSTR(unicode));
1746 _PyUnicode_WSTR(unicode) = NULL;
1747 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1748#else
1749 assert(num_surrogates == 0);
1750
Victor Stinnerc3c74152011-10-02 20:39:55 +02001751 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001752 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001753 _PyUnicode_UTF8(unicode) = NULL;
1754 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001755 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1756#endif
1757 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1758 }
1759 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001760 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001761 return 0;
1762}
1763
Alexander Belopolsky40018472011-02-26 01:02:56 +00001764static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001765unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001766{
Walter Dörwald16807132007-05-25 13:52:07 +00001767 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001768 case SSTATE_NOT_INTERNED:
1769 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001770
Benjamin Peterson29060642009-01-31 22:14:21 +00001771 case SSTATE_INTERNED_MORTAL:
1772 /* revive dead object temporarily for DelItem */
1773 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001774 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001775 Py_FatalError(
1776 "deletion of interned string failed");
1777 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001778
Benjamin Peterson29060642009-01-31 22:14:21 +00001779 case SSTATE_INTERNED_IMMORTAL:
1780 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001781
Benjamin Peterson29060642009-01-31 22:14:21 +00001782 default:
1783 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001784 }
1785
Victor Stinner03490912011-10-03 23:45:12 +02001786 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001787 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001788 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001789 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001790 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1791 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001792
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001793 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001794}
1795
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001796#ifdef Py_DEBUG
1797static int
1798unicode_is_singleton(PyObject *unicode)
1799{
1800 PyASCIIObject *ascii = (PyASCIIObject *)unicode;
1801 if (unicode == unicode_empty)
1802 return 1;
1803 if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1)
1804 {
1805 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
1806 if (ch < 256 && unicode_latin1[ch] == unicode)
1807 return 1;
1808 }
1809 return 0;
1810}
1811#endif
1812
Alexander Belopolsky40018472011-02-26 01:02:56 +00001813static int
Victor Stinner488fa492011-12-12 00:01:39 +01001814unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001815{
Victor Stinner488fa492011-12-12 00:01:39 +01001816 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001817 if (Py_REFCNT(unicode) != 1)
1818 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001819 if (_PyUnicode_HASH(unicode) != -1)
1820 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001821 if (PyUnicode_CHECK_INTERNED(unicode))
1822 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001823 if (!PyUnicode_CheckExact(unicode))
1824 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001825#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001826 /* singleton refcount is greater than 1 */
1827 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001828#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001829 return 1;
1830}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001831
Victor Stinnerfe226c02011-10-03 03:52:20 +02001832static int
1833unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1834{
1835 PyObject *unicode;
1836 Py_ssize_t old_length;
1837
1838 assert(p_unicode != NULL);
1839 unicode = *p_unicode;
1840
1841 assert(unicode != NULL);
1842 assert(PyUnicode_Check(unicode));
1843 assert(0 <= length);
1844
Victor Stinner910337b2011-10-03 03:20:16 +02001845 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001846 old_length = PyUnicode_WSTR_LENGTH(unicode);
1847 else
1848 old_length = PyUnicode_GET_LENGTH(unicode);
1849 if (old_length == length)
1850 return 0;
1851
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001852 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001853 _Py_INCREF_UNICODE_EMPTY();
1854 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001855 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001856 Py_SETREF(*p_unicode, unicode_empty);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001857 return 0;
1858 }
1859
Victor Stinner488fa492011-12-12 00:01:39 +01001860 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001861 PyObject *copy = resize_copy(unicode, length);
1862 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001863 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001864 Py_SETREF(*p_unicode, copy);
Benjamin Peterson29060642009-01-31 22:14:21 +00001865 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001866 }
1867
Victor Stinnerfe226c02011-10-03 03:52:20 +02001868 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001869 PyObject *new_unicode = resize_compact(unicode, length);
1870 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001871 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001872 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001873 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001874 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001875 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001876}
1877
Alexander Belopolsky40018472011-02-26 01:02:56 +00001878int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001879PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001880{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001881 PyObject *unicode;
1882 if (p_unicode == NULL) {
1883 PyErr_BadInternalCall();
1884 return -1;
1885 }
1886 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001887 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001888 {
1889 PyErr_BadInternalCall();
1890 return -1;
1891 }
1892 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001893}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001894
Serhiy Storchakad65c9492015-11-02 14:10:23 +02001895/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001896
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001897 WARNING: The function doesn't copy the terminating null character and
1898 doesn't check the maximum character (may write a latin1 character in an
1899 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001900static void
1901unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1902 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001903{
1904 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1905 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001906 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001907
1908 switch (kind) {
1909 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001910 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02001911#ifdef Py_DEBUG
1912 if (PyUnicode_IS_ASCII(unicode)) {
1913 Py_UCS4 maxchar = ucs1lib_find_max_char(
1914 (const Py_UCS1*)str,
1915 (const Py_UCS1*)str + len);
1916 assert(maxchar < 128);
1917 }
1918#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001919 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001920 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001921 }
1922 case PyUnicode_2BYTE_KIND: {
1923 Py_UCS2 *start = (Py_UCS2 *)data + index;
1924 Py_UCS2 *ucs2 = start;
1925 assert(index <= PyUnicode_GET_LENGTH(unicode));
1926
Victor Stinner184252a2012-06-16 02:57:41 +02001927 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001928 *ucs2 = (Py_UCS2)*str;
1929
1930 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001931 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001932 }
1933 default: {
1934 Py_UCS4 *start = (Py_UCS4 *)data + index;
1935 Py_UCS4 *ucs4 = start;
1936 assert(kind == PyUnicode_4BYTE_KIND);
1937 assert(index <= PyUnicode_GET_LENGTH(unicode));
1938
Victor Stinner184252a2012-06-16 02:57:41 +02001939 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001940 *ucs4 = (Py_UCS4)*str;
1941
1942 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001943 }
1944 }
1945}
1946
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001947static PyObject*
1948get_latin1_char(unsigned char ch)
1949{
Victor Stinnera464fc12011-10-02 20:39:30 +02001950 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001951 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001952 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001953 if (!unicode)
1954 return NULL;
1955 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001956 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001957 unicode_latin1[ch] = unicode;
1958 }
1959 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001960 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001961}
1962
Victor Stinner985a82a2014-01-03 12:53:47 +01001963static PyObject*
1964unicode_char(Py_UCS4 ch)
1965{
1966 PyObject *unicode;
1967
1968 assert(ch <= MAX_UNICODE);
1969
Victor Stinnerf3b46b42014-01-03 13:16:00 +01001970 if (ch < 256)
1971 return get_latin1_char(ch);
1972
Victor Stinner985a82a2014-01-03 12:53:47 +01001973 unicode = PyUnicode_New(1, ch);
1974 if (unicode == NULL)
1975 return NULL;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001976
1977 assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
1978 if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Victor Stinner985a82a2014-01-03 12:53:47 +01001979 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001980 } else {
Victor Stinner985a82a2014-01-03 12:53:47 +01001981 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1982 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
1983 }
1984 assert(_PyUnicode_CheckConsistency(unicode, 1));
1985 return unicode;
1986}
1987
Alexander Belopolsky40018472011-02-26 01:02:56 +00001988PyObject *
1989PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001990{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001991 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001992 Py_UCS4 maxchar = 0;
1993 Py_ssize_t num_surrogates;
1994
1995 if (u == NULL)
1996 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001997
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001998 /* If the Unicode data is known at construction time, we can apply
1999 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002000
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002001 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02002002 if (size == 0)
2003 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00002004
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002005 /* Single character Unicode objects in the Latin-1 range are
2006 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002007 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002008 return get_latin1_char((unsigned char)*u);
2009
2010 /* If not empty and not single character, copy the Unicode data
2011 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02002012 if (find_maxchar_surrogates(u, u + size,
2013 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002014 return NULL;
2015
Victor Stinner8faf8212011-12-08 22:14:11 +01002016 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002017 if (!unicode)
2018 return NULL;
2019
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002020 switch (PyUnicode_KIND(unicode)) {
2021 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002022 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002023 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2024 break;
2025 case PyUnicode_2BYTE_KIND:
2026#if Py_UNICODE_SIZE == 2
Christian Heimesf051e432016-09-13 20:22:02 +02002027 memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002028#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002029 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002030 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2031#endif
2032 break;
2033 case PyUnicode_4BYTE_KIND:
2034#if SIZEOF_WCHAR_T == 2
2035 /* This is the only case which has to process surrogates, thus
2036 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02002037 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002038#else
2039 assert(num_surrogates == 0);
Christian Heimesf051e432016-09-13 20:22:02 +02002040 memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002041#endif
2042 break;
2043 default:
2044 assert(0 && "Impossible state");
2045 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002046
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002047 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002048}
2049
Alexander Belopolsky40018472011-02-26 01:02:56 +00002050PyObject *
2051PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002052{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002053 if (size < 0) {
2054 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00002055 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00002056 return NULL;
2057 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002058 if (u != NULL)
2059 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2060 else
2061 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002062}
2063
Alexander Belopolsky40018472011-02-26 01:02:56 +00002064PyObject *
2065PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00002066{
2067 size_t size = strlen(u);
2068 if (size > PY_SSIZE_T_MAX) {
2069 PyErr_SetString(PyExc_OverflowError, "input too long");
2070 return NULL;
2071 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002072 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002073}
2074
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002075PyObject *
2076_PyUnicode_FromId(_Py_Identifier *id)
2077{
2078 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01002079 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
2080 strlen(id->string),
2081 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002082 if (!id->object)
2083 return NULL;
2084 PyUnicode_InternInPlace(&id->object);
2085 assert(!id->next);
2086 id->next = static_strings;
2087 static_strings = id;
2088 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002089 return id->object;
2090}
2091
2092void
2093_PyUnicode_ClearStaticStrings()
2094{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002095 _Py_Identifier *tmp, *s = static_strings;
2096 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02002097 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002098 tmp = s->next;
2099 s->next = NULL;
2100 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002101 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002102 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002103}
2104
Benjamin Peterson0df54292012-03-26 14:50:32 -04002105/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002106
Victor Stinnerd3f08822012-05-29 12:57:52 +02002107PyObject*
2108_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02002109{
Victor Stinnerd3f08822012-05-29 12:57:52 +02002110 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01002111 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01002112 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02002113#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002114 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02002115#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002116 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01002117 }
Victor Stinner785938e2011-12-11 20:09:03 +01002118 unicode = PyUnicode_New(size, 127);
2119 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02002120 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01002121 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2122 assert(_PyUnicode_CheckConsistency(unicode, 1));
2123 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02002124}
2125
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002126static Py_UCS4
2127kind_maxchar_limit(unsigned int kind)
2128{
Benjamin Petersonead6b532011-12-20 17:23:42 -06002129 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002130 case PyUnicode_1BYTE_KIND:
2131 return 0x80;
2132 case PyUnicode_2BYTE_KIND:
2133 return 0x100;
2134 case PyUnicode_4BYTE_KIND:
2135 return 0x10000;
2136 default:
2137 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01002138 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002139 }
2140}
2141
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -07002142static inline Py_UCS4
Victor Stinnere6abb482012-05-02 01:15:40 +02002143align_maxchar(Py_UCS4 maxchar)
2144{
2145 if (maxchar <= 127)
2146 return 127;
2147 else if (maxchar <= 255)
2148 return 255;
2149 else if (maxchar <= 65535)
2150 return 65535;
2151 else
2152 return MAX_UNICODE;
2153}
2154
Victor Stinner702c7342011-10-05 13:50:52 +02002155static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002156_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00002157{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002158 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002159 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002160
Serhiy Storchaka678db842013-01-26 12:16:36 +02002161 if (size == 0)
2162 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002163 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002164 if (size == 1)
2165 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002166
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002167 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002168 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002169 if (!res)
2170 return NULL;
2171 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002172 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002173 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00002174}
2175
Victor Stinnere57b1c02011-09-28 22:20:48 +02002176static PyObject*
2177_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002178{
2179 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002180 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002181
Serhiy Storchaka678db842013-01-26 12:16:36 +02002182 if (size == 0)
2183 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002184 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002185 if (size == 1)
2186 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002187
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002188 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002189 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002190 if (!res)
2191 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002192 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002193 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002194 else {
2195 _PyUnicode_CONVERT_BYTES(
2196 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2197 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002198 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002199 return res;
2200}
2201
Victor Stinnere57b1c02011-09-28 22:20:48 +02002202static PyObject*
2203_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002204{
2205 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002206 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002207
Serhiy Storchaka678db842013-01-26 12:16:36 +02002208 if (size == 0)
2209 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002210 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002211 if (size == 1)
2212 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002213
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002214 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002215 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002216 if (!res)
2217 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002218 if (max_char < 256)
2219 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2220 PyUnicode_1BYTE_DATA(res));
2221 else if (max_char < 0x10000)
2222 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2223 PyUnicode_2BYTE_DATA(res));
2224 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002225 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002226 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002227 return res;
2228}
2229
2230PyObject*
2231PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2232{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002233 if (size < 0) {
2234 PyErr_SetString(PyExc_ValueError, "size must be positive");
2235 return NULL;
2236 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002237 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002238 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002239 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002240 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002241 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002242 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002243 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002244 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002245 PyErr_SetString(PyExc_SystemError, "invalid kind");
2246 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002247 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002248}
2249
Victor Stinnerece58de2012-04-23 23:36:38 +02002250Py_UCS4
2251_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2252{
2253 enum PyUnicode_Kind kind;
2254 void *startptr, *endptr;
2255
2256 assert(PyUnicode_IS_READY(unicode));
2257 assert(0 <= start);
2258 assert(end <= PyUnicode_GET_LENGTH(unicode));
2259 assert(start <= end);
2260
2261 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2262 return PyUnicode_MAX_CHAR_VALUE(unicode);
2263
2264 if (start == end)
2265 return 127;
2266
Victor Stinner94d558b2012-04-27 22:26:58 +02002267 if (PyUnicode_IS_ASCII(unicode))
2268 return 127;
2269
Victor Stinnerece58de2012-04-23 23:36:38 +02002270 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002271 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002272 endptr = (char *)startptr + end * kind;
2273 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002274 switch(kind) {
2275 case PyUnicode_1BYTE_KIND:
2276 return ucs1lib_find_max_char(startptr, endptr);
2277 case PyUnicode_2BYTE_KIND:
2278 return ucs2lib_find_max_char(startptr, endptr);
2279 case PyUnicode_4BYTE_KIND:
2280 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002281 default:
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002282 assert(0);
2283 return 0;
Victor Stinnerece58de2012-04-23 23:36:38 +02002284 }
2285}
2286
Victor Stinner25a4b292011-10-06 12:31:55 +02002287/* Ensure that a string uses the most efficient storage, if it is not the
2288 case: create a new string with of the right kind. Write NULL into *p_unicode
2289 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002290static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002291unicode_adjust_maxchar(PyObject **p_unicode)
2292{
2293 PyObject *unicode, *copy;
2294 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002295 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002296 unsigned int kind;
2297
2298 assert(p_unicode != NULL);
2299 unicode = *p_unicode;
2300 assert(PyUnicode_IS_READY(unicode));
2301 if (PyUnicode_IS_ASCII(unicode))
2302 return;
2303
2304 len = PyUnicode_GET_LENGTH(unicode);
2305 kind = PyUnicode_KIND(unicode);
2306 if (kind == PyUnicode_1BYTE_KIND) {
2307 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002308 max_char = ucs1lib_find_max_char(u, u + len);
2309 if (max_char >= 128)
2310 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002311 }
2312 else if (kind == PyUnicode_2BYTE_KIND) {
2313 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002314 max_char = ucs2lib_find_max_char(u, u + len);
2315 if (max_char >= 256)
2316 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002317 }
2318 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002319 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002320 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002321 max_char = ucs4lib_find_max_char(u, u + len);
2322 if (max_char >= 0x10000)
2323 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002324 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002325 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002326 if (copy != NULL)
2327 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002328 Py_DECREF(unicode);
2329 *p_unicode = copy;
2330}
2331
Victor Stinner034f6cf2011-09-30 02:26:44 +02002332PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002333_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002334{
Victor Stinner87af4f22011-11-21 23:03:47 +01002335 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002336 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002337
Victor Stinner034f6cf2011-09-30 02:26:44 +02002338 if (!PyUnicode_Check(unicode)) {
2339 PyErr_BadInternalCall();
2340 return NULL;
2341 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002342 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002343 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002344
Victor Stinner87af4f22011-11-21 23:03:47 +01002345 length = PyUnicode_GET_LENGTH(unicode);
2346 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002347 if (!copy)
2348 return NULL;
2349 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2350
Christian Heimesf051e432016-09-13 20:22:02 +02002351 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
Victor Stinner87af4f22011-11-21 23:03:47 +01002352 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002353 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002354 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002355}
2356
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002357
Victor Stinnerbc603d12011-10-02 01:00:40 +02002358/* Widen Unicode objects to larger buffers. Don't write terminating null
2359 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002360
2361void*
2362_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2363{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002364 Py_ssize_t len;
2365 void *result;
2366 unsigned int skind;
2367
Benjamin Petersonbac79492012-01-14 13:34:47 -05002368 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002369 return NULL;
2370
2371 len = PyUnicode_GET_LENGTH(s);
2372 skind = PyUnicode_KIND(s);
2373 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002374 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002375 return NULL;
2376 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002377 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002378 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002379 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002380 if (!result)
2381 return PyErr_NoMemory();
2382 assert(skind == PyUnicode_1BYTE_KIND);
2383 _PyUnicode_CONVERT_BYTES(
2384 Py_UCS1, Py_UCS2,
2385 PyUnicode_1BYTE_DATA(s),
2386 PyUnicode_1BYTE_DATA(s) + len,
2387 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002388 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002389 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002390 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002391 if (!result)
2392 return PyErr_NoMemory();
2393 if (skind == PyUnicode_2BYTE_KIND) {
2394 _PyUnicode_CONVERT_BYTES(
2395 Py_UCS2, Py_UCS4,
2396 PyUnicode_2BYTE_DATA(s),
2397 PyUnicode_2BYTE_DATA(s) + len,
2398 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002399 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002400 else {
2401 assert(skind == PyUnicode_1BYTE_KIND);
2402 _PyUnicode_CONVERT_BYTES(
2403 Py_UCS1, Py_UCS4,
2404 PyUnicode_1BYTE_DATA(s),
2405 PyUnicode_1BYTE_DATA(s) + len,
2406 result);
2407 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002408 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002409 default:
2410 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002411 }
Victor Stinner01698042011-10-04 00:04:26 +02002412 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002413 return NULL;
2414}
2415
2416static Py_UCS4*
2417as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2418 int copy_null)
2419{
2420 int kind;
2421 void *data;
2422 Py_ssize_t len, targetlen;
2423 if (PyUnicode_READY(string) == -1)
2424 return NULL;
2425 kind = PyUnicode_KIND(string);
2426 data = PyUnicode_DATA(string);
2427 len = PyUnicode_GET_LENGTH(string);
2428 targetlen = len;
2429 if (copy_null)
2430 targetlen++;
2431 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002432 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002433 if (!target) {
2434 PyErr_NoMemory();
2435 return NULL;
2436 }
2437 }
2438 else {
2439 if (targetsize < targetlen) {
2440 PyErr_Format(PyExc_SystemError,
2441 "string is longer than the buffer");
2442 if (copy_null && 0 < targetsize)
2443 target[0] = 0;
2444 return NULL;
2445 }
2446 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002447 if (kind == PyUnicode_1BYTE_KIND) {
2448 Py_UCS1 *start = (Py_UCS1 *) data;
2449 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002450 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002451 else if (kind == PyUnicode_2BYTE_KIND) {
2452 Py_UCS2 *start = (Py_UCS2 *) data;
2453 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2454 }
2455 else {
2456 assert(kind == PyUnicode_4BYTE_KIND);
Christian Heimesf051e432016-09-13 20:22:02 +02002457 memcpy(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002458 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002459 if (copy_null)
2460 target[len] = 0;
2461 return target;
2462}
2463
2464Py_UCS4*
2465PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2466 int copy_null)
2467{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002468 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002469 PyErr_BadInternalCall();
2470 return NULL;
2471 }
2472 return as_ucs4(string, target, targetsize, copy_null);
2473}
2474
2475Py_UCS4*
2476PyUnicode_AsUCS4Copy(PyObject *string)
2477{
2478 return as_ucs4(string, NULL, 0, 1);
2479}
2480
2481#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00002482
Alexander Belopolsky40018472011-02-26 01:02:56 +00002483PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02002484PyUnicode_FromWideChar(const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002485{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002486 if (w == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002487 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02002488 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson29060642009-01-31 22:14:21 +00002489 PyErr_BadInternalCall();
2490 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002491 }
2492
Martin v. Löwis790465f2008-04-05 20:41:37 +00002493 if (size == -1) {
2494 size = wcslen(w);
2495 }
2496
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002497 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002498}
2499
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002500#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002501
/* Maximum number of characters required for the output of %lld or %p.
   At most ceil(log10(256)*SIZEOF_LONG_LONG) digits are needed; 53/22 is an
   upper bound for log10(256).  The leading 2 leaves room for the sign plus
   one more character (buffers of this size are filled with sprintf(),
   which also writes a terminating null). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG * 53 - 1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002506
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002507static int
2508unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2509 Py_ssize_t width, Py_ssize_t precision)
2510{
2511 Py_ssize_t length, fill, arglen;
2512 Py_UCS4 maxchar;
2513
2514 if (PyUnicode_READY(str) == -1)
2515 return -1;
2516
2517 length = PyUnicode_GET_LENGTH(str);
2518 if ((precision == -1 || precision >= length)
2519 && width <= length)
2520 return _PyUnicodeWriter_WriteStr(writer, str);
2521
2522 if (precision != -1)
2523 length = Py_MIN(precision, length);
2524
2525 arglen = Py_MAX(length, width);
2526 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2527 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2528 else
2529 maxchar = writer->maxchar;
2530
2531 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2532 return -1;
2533
2534 if (width > length) {
2535 fill = width - length;
2536 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2537 return -1;
2538 writer->pos += fill;
2539 }
2540
2541 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2542 str, 0, length);
2543 writer->pos += length;
2544 return 0;
2545}
2546
2547static int
2548unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2549 Py_ssize_t width, Py_ssize_t precision)
2550{
2551 /* UTF-8 */
2552 Py_ssize_t length;
2553 PyObject *unicode;
2554 int res;
2555
2556 length = strlen(str);
2557 if (precision != -1)
2558 length = Py_MIN(length, precision);
2559 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2560 if (unicode == NULL)
2561 return -1;
2562
2563 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2564 Py_DECREF(unicode);
2565 return res;
2566}
2567
Victor Stinner96865452011-03-01 23:44:09 +00002568static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002569unicode_fromformat_arg(_PyUnicodeWriter *writer,
2570 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002571{
Victor Stinnere215d962012-10-06 23:03:36 +02002572 const char *p;
2573 Py_ssize_t len;
2574 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002575 Py_ssize_t width;
2576 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002577 int longflag;
2578 int longlongflag;
2579 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002580 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002581
2582 p = f;
2583 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002584 zeropad = 0;
2585 if (*f == '0') {
2586 zeropad = 1;
2587 f++;
2588 }
Victor Stinner96865452011-03-01 23:44:09 +00002589
2590 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002591 width = -1;
2592 if (Py_ISDIGIT((unsigned)*f)) {
2593 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002594 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002595 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002596 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002597 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002598 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002599 return NULL;
2600 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002601 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002602 f++;
2603 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002604 }
2605 precision = -1;
2606 if (*f == '.') {
2607 f++;
2608 if (Py_ISDIGIT((unsigned)*f)) {
2609 precision = (*f - '0');
2610 f++;
2611 while (Py_ISDIGIT((unsigned)*f)) {
2612 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2613 PyErr_SetString(PyExc_ValueError,
2614 "precision too big");
2615 return NULL;
2616 }
2617 precision = (precision * 10) + (*f - '0');
2618 f++;
2619 }
2620 }
Victor Stinner96865452011-03-01 23:44:09 +00002621 if (*f == '%') {
2622 /* "%.3%s" => f points to "3" */
2623 f--;
2624 }
2625 }
2626 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002627 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002628 f--;
2629 }
Victor Stinner96865452011-03-01 23:44:09 +00002630
2631 /* Handle %ld, %lu, %lld and %llu. */
2632 longflag = 0;
2633 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002634 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002635 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002636 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002637 longflag = 1;
2638 ++f;
2639 }
Victor Stinner96865452011-03-01 23:44:09 +00002640 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002641 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002642 longlongflag = 1;
2643 f += 2;
2644 }
Victor Stinner96865452011-03-01 23:44:09 +00002645 }
2646 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002647 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002648 size_tflag = 1;
2649 ++f;
2650 }
Victor Stinnere215d962012-10-06 23:03:36 +02002651
2652 if (f[1] == '\0')
2653 writer->overallocate = 0;
2654
2655 switch (*f) {
2656 case 'c':
2657 {
2658 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002659 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002660 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002661 "character argument not in range(0x110000)");
2662 return NULL;
2663 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002664 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002665 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002666 break;
2667 }
2668
2669 case 'i':
2670 case 'd':
2671 case 'u':
2672 case 'x':
2673 {
2674 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002675 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002676 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002677
2678 if (*f == 'u') {
Victor Stinnere215d962012-10-06 23:03:36 +02002679 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002680 len = sprintf(buffer, "%lu",
Victor Stinnere215d962012-10-06 23:03:36 +02002681 va_arg(*vargs, unsigned long));
Victor Stinnere215d962012-10-06 23:03:36 +02002682 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002683 len = sprintf(buffer, "%llu",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002684 va_arg(*vargs, unsigned long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002685 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002686 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002687 va_arg(*vargs, size_t));
2688 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002689 len = sprintf(buffer, "%u",
Victor Stinnere215d962012-10-06 23:03:36 +02002690 va_arg(*vargs, unsigned int));
2691 }
2692 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002693 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002694 }
2695 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002696 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002697 len = sprintf(buffer, "%li",
Victor Stinnere215d962012-10-06 23:03:36 +02002698 va_arg(*vargs, long));
Victor Stinnere215d962012-10-06 23:03:36 +02002699 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002700 len = sprintf(buffer, "%lli",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002701 va_arg(*vargs, long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002702 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002703 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002704 va_arg(*vargs, Py_ssize_t));
2705 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002706 len = sprintf(buffer, "%i",
Victor Stinnere215d962012-10-06 23:03:36 +02002707 va_arg(*vargs, int));
2708 }
2709 assert(len >= 0);
2710
Victor Stinnere215d962012-10-06 23:03:36 +02002711 if (precision < len)
2712 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002713
2714 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002715 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2716 return NULL;
2717
Victor Stinnere215d962012-10-06 23:03:36 +02002718 if (width > precision) {
2719 Py_UCS4 fillchar;
2720 fill = width - precision;
2721 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002722 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2723 return NULL;
2724 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002725 }
Victor Stinner15a11362012-10-06 23:48:20 +02002726 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002727 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002728 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2729 return NULL;
2730 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002731 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002732
Victor Stinner4a587072013-11-19 12:54:53 +01002733 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2734 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002735 break;
2736 }
2737
2738 case 'p':
2739 {
2740 char number[MAX_LONG_LONG_CHARS];
2741
2742 len = sprintf(number, "%p", va_arg(*vargs, void*));
2743 assert(len >= 0);
2744
2745 /* %p is ill-defined: ensure leading 0x. */
2746 if (number[1] == 'X')
2747 number[1] = 'x';
2748 else if (number[1] != 'x') {
2749 memmove(number + 2, number,
2750 strlen(number) + 1);
2751 number[0] = '0';
2752 number[1] = 'x';
2753 len += 2;
2754 }
2755
Victor Stinner4a587072013-11-19 12:54:53 +01002756 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002757 return NULL;
2758 break;
2759 }
2760
2761 case 's':
2762 {
2763 /* UTF-8 */
2764 const char *s = va_arg(*vargs, const char*);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002765 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002766 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002767 break;
2768 }
2769
2770 case 'U':
2771 {
2772 PyObject *obj = va_arg(*vargs, PyObject *);
2773 assert(obj && _PyUnicode_CHECK(obj));
2774
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002775 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002776 return NULL;
2777 break;
2778 }
2779
2780 case 'V':
2781 {
2782 PyObject *obj = va_arg(*vargs, PyObject *);
2783 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002784 if (obj) {
2785 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002786 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002787 return NULL;
2788 }
2789 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002790 assert(str != NULL);
2791 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002792 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002793 }
2794 break;
2795 }
2796
2797 case 'S':
2798 {
2799 PyObject *obj = va_arg(*vargs, PyObject *);
2800 PyObject *str;
2801 assert(obj);
2802 str = PyObject_Str(obj);
2803 if (!str)
2804 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002805 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002806 Py_DECREF(str);
2807 return NULL;
2808 }
2809 Py_DECREF(str);
2810 break;
2811 }
2812
2813 case 'R':
2814 {
2815 PyObject *obj = va_arg(*vargs, PyObject *);
2816 PyObject *repr;
2817 assert(obj);
2818 repr = PyObject_Repr(obj);
2819 if (!repr)
2820 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002821 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002822 Py_DECREF(repr);
2823 return NULL;
2824 }
2825 Py_DECREF(repr);
2826 break;
2827 }
2828
2829 case 'A':
2830 {
2831 PyObject *obj = va_arg(*vargs, PyObject *);
2832 PyObject *ascii;
2833 assert(obj);
2834 ascii = PyObject_ASCII(obj);
2835 if (!ascii)
2836 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002837 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002838 Py_DECREF(ascii);
2839 return NULL;
2840 }
2841 Py_DECREF(ascii);
2842 break;
2843 }
2844
2845 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002846 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002847 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002848 break;
2849
2850 default:
2851 /* if we stumble upon an unknown formatting code, copy the rest
2852 of the format string to the output string. (we cannot just
2853 skip the code, since there's no way to know what's in the
2854 argument list) */
2855 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01002856 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002857 return NULL;
2858 f = p+len;
2859 return f;
2860 }
2861
2862 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002863 return f;
2864}
2865
Walter Dörwaldd2034312007-05-18 16:29:38 +00002866PyObject *
2867PyUnicode_FromFormatV(const char *format, va_list vargs)
2868{
Victor Stinnere215d962012-10-06 23:03:36 +02002869 va_list vargs2;
2870 const char *f;
2871 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002872
Victor Stinner8f674cc2013-04-17 23:02:17 +02002873 _PyUnicodeWriter_Init(&writer);
2874 writer.min_length = strlen(format) + 100;
2875 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02002876
Benjamin Peterson0c212142016-09-20 20:39:33 -07002877 // Copy varags to be able to pass a reference to a subfunction.
2878 va_copy(vargs2, vargs);
Victor Stinnere215d962012-10-06 23:03:36 +02002879
2880 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002881 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002882 f = unicode_fromformat_arg(&writer, f, &vargs2);
2883 if (f == NULL)
2884 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002885 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002886 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002887 const char *p;
2888 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002889
Victor Stinnere215d962012-10-06 23:03:36 +02002890 p = f;
2891 do
2892 {
2893 if ((unsigned char)*p > 127) {
2894 PyErr_Format(PyExc_ValueError,
2895 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2896 "string, got a non-ASCII byte: 0x%02x",
2897 (unsigned char)*p);
Victor Stinner1ddf53d2016-09-21 14:13:14 +02002898 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002899 }
2900 p++;
2901 }
2902 while (*p != '\0' && *p != '%');
2903 len = p - f;
2904
2905 if (*p == '\0')
2906 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01002907
2908 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002909 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002910
2911 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002912 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002913 }
Christian Heimes2f2fee12016-09-21 11:37:27 +02002914 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02002915 return _PyUnicodeWriter_Finish(&writer);
2916
2917 fail:
Christian Heimes2f2fee12016-09-21 11:37:27 +02002918 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02002919 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002920 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002921}
2922
Walter Dörwaldd2034312007-05-18 16:29:38 +00002923PyObject *
2924PyUnicode_FromFormat(const char *format, ...)
2925{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002926 PyObject* ret;
2927 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002928
2929#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002930 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002931#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002932 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002933#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002934 ret = PyUnicode_FromFormatV(format, vargs);
2935 va_end(vargs);
2936 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002937}
2938
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002939#ifdef HAVE_WCHAR_H
2940
Victor Stinner5593d8a2010-10-02 11:11:27 +00002941/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2942 convert a Unicode object to a wide character string.
2943
Victor Stinnerd88d9832011-09-06 02:00:05 +02002944 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002945 character) required to convert the unicode object. Ignore size argument.
2946
Victor Stinnerd88d9832011-09-06 02:00:05 +02002947 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002948 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002949 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002950static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002951unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002952 wchar_t *w,
2953 Py_ssize_t size)
2954{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002955 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002956 const wchar_t *wstr;
2957
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002958 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002959 if (wstr == NULL)
2960 return -1;
2961
Victor Stinner5593d8a2010-10-02 11:11:27 +00002962 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002963 if (size > res)
2964 size = res + 1;
2965 else
2966 res = size;
Christian Heimesf051e432016-09-13 20:22:02 +02002967 memcpy(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002968 return res;
2969 }
2970 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002971 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002972}
2973
2974Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002975PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002976 wchar_t *w,
2977 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002978{
2979 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002980 PyErr_BadInternalCall();
2981 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002982 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002983 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002984}
2985
Victor Stinner137c34c2010-09-29 10:25:54 +00002986wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002987PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002988 Py_ssize_t *size)
2989{
2990 wchar_t* buffer;
2991 Py_ssize_t buflen;
2992
2993 if (unicode == NULL) {
2994 PyErr_BadInternalCall();
2995 return NULL;
2996 }
2997
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002998 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002999 if (buflen == -1)
3000 return NULL;
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02003001 buffer = PyMem_NEW(wchar_t, buflen);
Victor Stinner137c34c2010-09-29 10:25:54 +00003002 if (buffer == NULL) {
3003 PyErr_NoMemory();
3004 return NULL;
3005 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003006 buflen = unicode_aswidechar(unicode, buffer, buflen);
Stefan Krah8528c312012-08-19 21:52:43 +02003007 if (buflen == -1) {
3008 PyMem_FREE(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003009 return NULL;
Stefan Krah8528c312012-08-19 21:52:43 +02003010 }
Victor Stinner5593d8a2010-10-02 11:11:27 +00003011 if (size != NULL)
3012 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00003013 return buffer;
3014}
3015
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003016#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003017
Alexander Belopolsky40018472011-02-26 01:02:56 +00003018PyObject *
3019PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003020{
Victor Stinner8faf8212011-12-08 22:14:11 +01003021 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003022 PyErr_SetString(PyExc_ValueError,
3023 "chr() arg not in range(0x110000)");
3024 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003025 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00003026
Victor Stinner985a82a2014-01-03 12:53:47 +01003027 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003028}
3029
Alexander Belopolsky40018472011-02-26 01:02:56 +00003030PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003031PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003032{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003033 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00003034 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003035 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003036 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02003037 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00003038 Py_INCREF(obj);
3039 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003040 }
3041 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003042 /* For a Unicode subtype that's not a Unicode object,
3043 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003044 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003045 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003046 PyErr_Format(PyExc_TypeError,
3047 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00003048 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003049 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003050}
3051
Alexander Belopolsky40018472011-02-26 01:02:56 +00003052PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003053PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003054 const char *encoding,
3055 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003056{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003057 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003058 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003059
Guido van Rossumd57fd912000-03-10 22:53:23 +00003060 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003061 PyErr_BadInternalCall();
3062 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003063 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003064
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003065 /* Decoding bytes objects is the most common case and should be fast */
3066 if (PyBytes_Check(obj)) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003067 if (PyBytes_GET_SIZE(obj) == 0)
3068 _Py_RETURN_UNICODE_EMPTY();
3069 v = PyUnicode_Decode(
3070 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3071 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003072 return v;
3073 }
3074
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003075 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003076 PyErr_SetString(PyExc_TypeError,
3077 "decoding str is not supported");
3078 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003079 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003080
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003081 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3082 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3083 PyErr_Format(PyExc_TypeError,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003084 "decoding to str: need a bytes-like object, %.80s found",
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003085 Py_TYPE(obj)->tp_name);
3086 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003087 }
Tim Petersced69f82003-09-16 20:30:58 +00003088
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003089 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003090 PyBuffer_Release(&buffer);
3091 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003092 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003093
Serhiy Storchaka05997252013-01-26 12:14:02 +02003094 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003095 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003096 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003097}
3098
/* Normalize an encoding name: similar to encodings.normalize_encoding(), but
   also convert to lowercase.

   Alphanumeric characters and '.' are copied to `lower` in lowercase.  Every
   run of other characters found between two copied characters is replaced by
   a single '_'; leading and trailing runs are dropped.  On success `lower`
   is null-terminated.

   `lower_len` is the size of the `lower` buffer, terminator included.

   Return 1 on success, or 0 on error (lower_len is 0, or the normalized name
   does not fit in lower_len-1 characters). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;
    int punct;

    assert(encoding != NULL);

    /* An empty buffer cannot even hold the terminator.  Bail out early:
       &lower[lower_len - 1] would otherwise point before the buffer and the
       bounds checks below would never trigger. */
    if (lower_len == 0) {
        return 0;
    }

    e = encoding;
    l = lower;
    l_end = &lower[lower_len - 1];   /* slot reserved for the terminator */
    punct = 0;                       /* pending run of skipped characters? */
    while (1) {
        char c = *e;
        if (c == 0) {
            break;
        }

        if (Py_ISALNUM(c) || c == '.') {
            /* Emit one '_' for the pending run, unless nothing has been
               written yet (leading punctuation is dropped). */
            if (punct && l != lower) {
                if (l == l_end) {
                    return 0;
                }
                *l++ = '_';
            }
            punct = 0;

            if (l == l_end) {
                return 0;
            }
            *l++ = Py_TOLOWER(c);
        }
        else {
            punct = 1;
        }

        e++;
    }
    *l = '\0';
    return 1;
}
3147
Alexander Belopolsky40018472011-02-26 01:02:56 +00003148PyObject *
3149PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003150 Py_ssize_t size,
3151 const char *encoding,
3152 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003153{
3154 PyObject *buffer = NULL, *unicode;
3155 Py_buffer info;
Victor Stinner942889a2016-09-05 15:40:10 -07003156 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3157
3158 if (encoding == NULL) {
3159 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3160 }
Victor Stinner600d3be2010-06-10 12:00:55 +00003161
Fred Drakee4315f52000-05-09 19:53:39 +00003162 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003163 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3164 char *lower = buflower;
3165
3166 /* Fast paths */
3167 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3168 lower += 3;
3169 if (*lower == '_') {
3170 /* Match "utf8" and "utf_8" */
3171 lower++;
3172 }
3173
3174 if (lower[0] == '8' && lower[1] == 0) {
3175 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3176 }
3177 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3178 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3179 }
3180 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3181 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3182 }
3183 }
3184 else {
3185 if (strcmp(lower, "ascii") == 0
3186 || strcmp(lower, "us_ascii") == 0) {
3187 return PyUnicode_DecodeASCII(s, size, errors);
3188 }
Steve Dowercc16be82016-09-08 10:35:16 -07003189 #ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003190 else if (strcmp(lower, "mbcs") == 0) {
3191 return PyUnicode_DecodeMBCS(s, size, errors);
3192 }
3193 #endif
3194 else if (strcmp(lower, "latin1") == 0
3195 || strcmp(lower, "latin_1") == 0
3196 || strcmp(lower, "iso_8859_1") == 0
3197 || strcmp(lower, "iso8859_1") == 0) {
3198 return PyUnicode_DecodeLatin1(s, size, errors);
3199 }
3200 }
Victor Stinner37296e82010-06-10 13:36:23 +00003201 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003202
3203 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003204 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003205 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003206 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003207 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003208 if (buffer == NULL)
3209 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003210 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003211 if (unicode == NULL)
3212 goto onError;
3213 if (!PyUnicode_Check(unicode)) {
3214 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003215 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3216 "use codecs.decode() to decode to arbitrary types",
3217 encoding,
Benjamin Peterson8d761ff2016-10-16 15:41:46 -07003218 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003219 Py_DECREF(unicode);
3220 goto onError;
3221 }
3222 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003223 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003224
Benjamin Peterson29060642009-01-31 22:14:21 +00003225 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003226 Py_XDECREF(buffer);
3227 return NULL;
3228}
3229
Alexander Belopolsky40018472011-02-26 01:02:56 +00003230PyObject *
3231PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003232 const char *encoding,
3233 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003234{
3235 PyObject *v;
3236
3237 if (!PyUnicode_Check(unicode)) {
3238 PyErr_BadArgument();
3239 goto onError;
3240 }
3241
3242 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003243 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003244
3245 /* Decode via the codec registry */
3246 v = PyCodec_Decode(unicode, encoding, errors);
3247 if (v == NULL)
3248 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003249 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003250
Benjamin Peterson29060642009-01-31 22:14:21 +00003251 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003252 return NULL;
3253}
3254
Alexander Belopolsky40018472011-02-26 01:02:56 +00003255PyObject *
3256PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003257 const char *encoding,
3258 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003259{
3260 PyObject *v;
3261
3262 if (!PyUnicode_Check(unicode)) {
3263 PyErr_BadArgument();
3264 goto onError;
3265 }
3266
3267 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003268 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003269
3270 /* Decode via the codec registry */
3271 v = PyCodec_Decode(unicode, encoding, errors);
3272 if (v == NULL)
3273 goto onError;
3274 if (!PyUnicode_Check(v)) {
3275 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003276 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3277 "use codecs.decode() to decode to arbitrary types",
3278 encoding,
Benjamin Peterson8d761ff2016-10-16 15:41:46 -07003279 Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003280 Py_DECREF(v);
3281 goto onError;
3282 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003283 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003284
Benjamin Peterson29060642009-01-31 22:14:21 +00003285 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003286 return NULL;
3287}
3288
Alexander Belopolsky40018472011-02-26 01:02:56 +00003289PyObject *
3290PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003291 Py_ssize_t size,
3292 const char *encoding,
3293 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003294{
3295 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003296
Guido van Rossumd57fd912000-03-10 22:53:23 +00003297 unicode = PyUnicode_FromUnicode(s, size);
3298 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003299 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003300 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3301 Py_DECREF(unicode);
3302 return v;
3303}
3304
Alexander Belopolsky40018472011-02-26 01:02:56 +00003305PyObject *
3306PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003307 const char *encoding,
3308 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003309{
3310 PyObject *v;
3311
3312 if (!PyUnicode_Check(unicode)) {
3313 PyErr_BadArgument();
3314 goto onError;
3315 }
3316
3317 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003318 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003319
3320 /* Encode via the codec registry */
3321 v = PyCodec_Encode(unicode, encoding, errors);
3322 if (v == NULL)
3323 goto onError;
3324 return v;
3325
Benjamin Peterson29060642009-01-31 22:14:21 +00003326 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003327 return NULL;
3328}
3329
/* Locate the first wide character of `wstr` that wcstombs() cannot convert.

   The string is converted one character at a time (one surrogate pair at a
   time when wchar_t is 16 bits wide).  Return the index, in wchar_t units,
   of the first character for which wcstombs() fails, or 0 if every
   character converts. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
#if SIZEOF_WCHAR_T == 2
    wchar_t unit[3];            /* surrogate pair + terminator */
#else
    wchar_t unit[2];            /* single character + terminator */
#endif
    char scratch[MB_LEN_MAX];
    const wchar_t *cursor = wstr;

#if SIZEOF_WCHAR_T == 2
    unit[2] = 0;
#else
    unit[1] = 0;
#endif

    while (*cursor != L'\0') {
        /* Remember where this character starts before advancing. */
        const wchar_t *candidate = cursor;

#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cursor[1]))
        {
            unit[0] = cursor[0];
            unit[1] = cursor[1];
            cursor += 2;
        }
        else {
            unit[0] = *cursor;
            unit[1] = 0;
            cursor++;
        }
#else
        unit[0] = *cursor;
        cursor++;
#endif
        if (wcstombs(scratch, unit, sizeof(scratch)) == (size_t)-1) {
            return candidate - wstr;
        }
    }

    /* failed to find the unencodable character */
    return 0;
}
3376
Victor Stinner1b579672011-12-17 05:47:23 +01003377static int
3378locale_error_handler(const char *errors, int *surrogateescape)
3379{
Victor Stinner50149202015-09-22 00:26:54 +02003380 _Py_error_handler error_handler = get_error_handler(errors);
3381 switch (error_handler)
3382 {
3383 case _Py_ERROR_STRICT:
Victor Stinner1b579672011-12-17 05:47:23 +01003384 *surrogateescape = 0;
3385 return 0;
Victor Stinner50149202015-09-22 00:26:54 +02003386 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner1b579672011-12-17 05:47:23 +01003387 *surrogateescape = 1;
3388 return 0;
Victor Stinner50149202015-09-22 00:26:54 +02003389 default:
3390 PyErr_Format(PyExc_ValueError,
3391 "only 'strict' and 'surrogateescape' error handlers "
3392 "are supported, not '%s'",
3393 errors);
3394 return -1;
Victor Stinner1b579672011-12-17 05:47:23 +01003395 }
Victor Stinner1b579672011-12-17 05:47:23 +01003396}
3397
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003398PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003399PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003400{
3401 Py_ssize_t wlen, wlen2;
3402 wchar_t *wstr;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003403 char *errmsg;
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003404 PyObject *bytes, *reason, *exc;
3405 size_t error_pos, errlen;
Victor Stinner1b579672011-12-17 05:47:23 +01003406 int surrogateescape;
3407
3408 if (locale_error_handler(errors, &surrogateescape) < 0)
3409 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003410
3411 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3412 if (wstr == NULL)
3413 return NULL;
3414
3415 wlen2 = wcslen(wstr);
3416 if (wlen2 != wlen) {
3417 PyMem_Free(wstr);
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003418 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003419 return NULL;
3420 }
3421
3422 if (surrogateescape) {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003423 /* "surrogateescape" error handler */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003424 char *str;
3425
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003426 str = Py_EncodeLocale(wstr, &error_pos);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003427 if (str == NULL) {
3428 if (error_pos == (size_t)-1) {
3429 PyErr_NoMemory();
3430 PyMem_Free(wstr);
3431 return NULL;
3432 }
3433 else {
3434 goto encode_error;
3435 }
3436 }
3437 PyMem_Free(wstr);
3438
3439 bytes = PyBytes_FromString(str);
3440 PyMem_Free(str);
3441 }
3442 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003443 /* strict mode */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003444 size_t len, len2;
3445
3446 len = wcstombs(NULL, wstr, 0);
3447 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003448 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003449 goto encode_error;
3450 }
3451
3452 bytes = PyBytes_FromStringAndSize(NULL, len);
3453 if (bytes == NULL) {
3454 PyMem_Free(wstr);
3455 return NULL;
3456 }
3457
3458 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3459 if (len2 == (size_t)-1 || len2 > len) {
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003460 Py_DECREF(bytes);
Victor Stinner2f197072011-12-17 07:08:30 +01003461 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003462 goto encode_error;
3463 }
3464 PyMem_Free(wstr);
3465 }
3466 return bytes;
3467
3468encode_error:
3469 errmsg = strerror(errno);
3470 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003471
3472 if (error_pos == (size_t)-1)
3473 error_pos = wcstombs_errorpos(wstr);
3474
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003475 PyMem_Free(wstr);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003476
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003477 wstr = Py_DecodeLocale(errmsg, &errlen);
3478 if (wstr != NULL) {
3479 reason = PyUnicode_FromWideChar(wstr, errlen);
3480 PyMem_RawFree(wstr);
3481 } else {
3482 errmsg = NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01003483 }
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003484
Victor Stinner2f197072011-12-17 07:08:30 +01003485 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003486 reason = PyUnicode_FromString(
3487 "wcstombs() encountered an unencodable "
3488 "wide character");
3489 if (reason == NULL)
3490 return NULL;
3491
3492 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3493 "locale", unicode,
3494 (Py_ssize_t)error_pos,
3495 (Py_ssize_t)(error_pos+1),
3496 reason);
3497 Py_DECREF(reason);
3498 if (exc != NULL) {
3499 PyCodec_StrictErrors(exc);
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003500 Py_DECREF(exc);
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003501 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003502 return NULL;
3503}
3504
Victor Stinnerad158722010-10-27 00:25:46 +00003505PyObject *
3506PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003507{
Steve Dowercc16be82016-09-08 10:35:16 -07003508#if defined(__APPLE__)
3509 return _PyUnicode_AsUTF8String(unicode, Py_FileSystemDefaultEncodeErrors);
Victor Stinnerad158722010-10-27 00:25:46 +00003510#else
Victor Stinner793b5312011-04-27 00:24:21 +02003511 PyInterpreterState *interp = PyThreadState_GET()->interp;
3512 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3513 cannot use it to encode and decode filenames before it is loaded. Load
3514 the Python codec requires to encode at least its own filename. Use the C
3515 version of the locale codec until the codec registry is initialized and
3516 the Python codec is loaded.
3517
3518 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3519 cannot only rely on it: check also interp->fscodec_initialized for
3520 subinterpreters. */
3521 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003522 return PyUnicode_AsEncodedString(unicode,
3523 Py_FileSystemDefaultEncoding,
Steve Dowercc16be82016-09-08 10:35:16 -07003524 Py_FileSystemDefaultEncodeErrors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003525 }
3526 else {
Steve Dowercc16be82016-09-08 10:35:16 -07003527 return PyUnicode_EncodeLocale(unicode, Py_FileSystemDefaultEncodeErrors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003528 }
Victor Stinnerad158722010-10-27 00:25:46 +00003529#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003530}
3531
Alexander Belopolsky40018472011-02-26 01:02:56 +00003532PyObject *
3533PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003534 const char *encoding,
3535 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003536{
3537 PyObject *v;
Victor Stinner942889a2016-09-05 15:40:10 -07003538 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003539
Guido van Rossumd57fd912000-03-10 22:53:23 +00003540 if (!PyUnicode_Check(unicode)) {
3541 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003542 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003543 }
Fred Drakee4315f52000-05-09 19:53:39 +00003544
Victor Stinner942889a2016-09-05 15:40:10 -07003545 if (encoding == NULL) {
3546 return _PyUnicode_AsUTF8String(unicode, errors);
3547 }
3548
Fred Drakee4315f52000-05-09 19:53:39 +00003549 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003550 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3551 char *lower = buflower;
3552
3553 /* Fast paths */
3554 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3555 lower += 3;
3556 if (*lower == '_') {
3557 /* Match "utf8" and "utf_8" */
3558 lower++;
3559 }
3560
3561 if (lower[0] == '8' && lower[1] == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003562 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner942889a2016-09-05 15:40:10 -07003563 }
3564 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3565 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3566 }
3567 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3568 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3569 }
Victor Stinnera5c68c32011-03-02 01:03:14 +00003570 }
Victor Stinner942889a2016-09-05 15:40:10 -07003571 else {
3572 if (strcmp(lower, "ascii") == 0
3573 || strcmp(lower, "us_ascii") == 0) {
3574 return _PyUnicode_AsASCIIString(unicode, errors);
3575 }
Steve Dowercc16be82016-09-08 10:35:16 -07003576#ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003577 else if (strcmp(lower, "mbcs") == 0) {
3578 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3579 }
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003580#endif
Victor Stinner942889a2016-09-05 15:40:10 -07003581 else if (strcmp(lower, "latin1") == 0 ||
3582 strcmp(lower, "latin_1") == 0 ||
3583 strcmp(lower, "iso_8859_1") == 0 ||
3584 strcmp(lower, "iso8859_1") == 0) {
3585 return _PyUnicode_AsLatin1String(unicode, errors);
3586 }
3587 }
Victor Stinner37296e82010-06-10 13:36:23 +00003588 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003589
3590 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003591 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003592 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003593 return NULL;
3594
3595 /* The normal path */
3596 if (PyBytes_Check(v))
3597 return v;
3598
3599 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003600 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003601 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003602 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003603
3604 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003605 "encoder %s returned bytearray instead of bytes; "
3606 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003607 encoding);
3608 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003609 Py_DECREF(v);
3610 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003611 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003612
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003613 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3614 Py_DECREF(v);
3615 return b;
3616 }
3617
3618 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003619 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
3620 "use codecs.encode() to encode to arbitrary types",
3621 encoding,
Benjamin Peterson8d761ff2016-10-16 15:41:46 -07003622 Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003623 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003624 return NULL;
3625}
3626
Alexander Belopolsky40018472011-02-26 01:02:56 +00003627PyObject *
3628PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003629 const char *encoding,
3630 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003631{
3632 PyObject *v;
3633
3634 if (!PyUnicode_Check(unicode)) {
3635 PyErr_BadArgument();
3636 goto onError;
3637 }
3638
3639 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003640 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003641
3642 /* Encode via the codec registry */
3643 v = PyCodec_Encode(unicode, encoding, errors);
3644 if (v == NULL)
3645 goto onError;
3646 if (!PyUnicode_Check(v)) {
3647 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003648 "'%.400s' encoder returned '%.400s' instead of 'str'; "
3649 "use codecs.encode() to encode to arbitrary types",
3650 encoding,
Benjamin Peterson8d761ff2016-10-16 15:41:46 -07003651 Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003652 Py_DECREF(v);
3653 goto onError;
3654 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003655 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003656
Benjamin Peterson29060642009-01-31 22:14:21 +00003657 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003658 return NULL;
3659}
3660
/* Locate the byte offset in `str` (of `len` bytes) at which mbrtowc()
   first reports a conversion error or an incomplete character.

   Returns 0 when no such position is found, and also when mbrtowc() is
   not available (HAVE_MBRTOWC undefined). */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *cursor = str;
    size_t remaining = len;
    size_t consumed;
    mbstate_t state;
    wchar_t wc;

    memset(&state, 0, sizeof state);
    while (remaining != 0) {
        consumed = mbrtowc(&wc, cursor, remaining, &state);
        if (consumed == 0) {
            /* Reached end of string */
            break;
        }
        if (consumed == (size_t)-1 || consumed == (size_t)-2) {
            /* Conversion error or incomplete character */
            return cursor - str;
        }
        cursor += consumed;
        remaining -= consumed;
    }
    /* fall through: failed to find the undecodable byte sequence */
#endif
    return 0;
}
3691
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003692PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003693PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003694 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003695{
3696 wchar_t smallbuf[256];
3697 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3698 wchar_t *wstr;
3699 size_t wlen, wlen2;
3700 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003701 int surrogateescape;
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003702 size_t error_pos, errlen;
Victor Stinner2f197072011-12-17 07:08:30 +01003703 char *errmsg;
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003704 PyObject *exc, *reason = NULL; /* initialize to prevent gcc warning */
Victor Stinner1b579672011-12-17 05:47:23 +01003705
3706 if (locale_error_handler(errors, &surrogateescape) < 0)
3707 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003708
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003709 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3710 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003711 return NULL;
3712 }
3713
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003714 if (surrogateescape) {
3715 /* "surrogateescape" error handler */
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003716 wstr = Py_DecodeLocale(str, &wlen);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003717 if (wstr == NULL) {
3718 if (wlen == (size_t)-1)
3719 PyErr_NoMemory();
3720 else
3721 PyErr_SetFromErrno(PyExc_OSError);
3722 return NULL;
3723 }
3724
3725 unicode = PyUnicode_FromWideChar(wstr, wlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003726 PyMem_RawFree(wstr);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003727 }
3728 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003729 /* strict mode */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003730#ifndef HAVE_BROKEN_MBSTOWCS
3731 wlen = mbstowcs(NULL, str, 0);
3732#else
3733 wlen = len;
3734#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003735 if (wlen == (size_t)-1)
3736 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003737 if (wlen+1 <= smallbuf_len) {
3738 wstr = smallbuf;
3739 }
3740 else {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02003741 wstr = PyMem_New(wchar_t, wlen+1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003742 if (!wstr)
3743 return PyErr_NoMemory();
3744 }
3745
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003746 wlen2 = mbstowcs(wstr, str, wlen+1);
3747 if (wlen2 == (size_t)-1) {
3748 if (wstr != smallbuf)
3749 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003750 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003751 }
3752#ifdef HAVE_BROKEN_MBSTOWCS
3753 assert(wlen2 == wlen);
3754#endif
3755 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3756 if (wstr != smallbuf)
3757 PyMem_Free(wstr);
3758 }
3759 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003760
3761decode_error:
3762 errmsg = strerror(errno);
3763 assert(errmsg != NULL);
3764
3765 error_pos = mbstowcs_errorpos(str, len);
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003766 wstr = Py_DecodeLocale(errmsg, &errlen);
3767 if (wstr != NULL) {
3768 reason = PyUnicode_FromWideChar(wstr, errlen);
3769 PyMem_RawFree(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003770 }
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003771
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +02003772 if (reason == NULL)
Victor Stinner2f197072011-12-17 07:08:30 +01003773 reason = PyUnicode_FromString(
3774 "mbstowcs() encountered an invalid multibyte sequence");
3775 if (reason == NULL)
3776 return NULL;
3777
3778 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3779 "locale", str, len,
3780 (Py_ssize_t)error_pos,
3781 (Py_ssize_t)(error_pos+1),
3782 reason);
3783 Py_DECREF(reason);
3784 if (exc != NULL) {
3785 PyCodec_StrictErrors(exc);
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003786 Py_DECREF(exc);
Victor Stinner2f197072011-12-17 07:08:30 +01003787 }
3788 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003789}
3790
3791PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003792PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003793{
3794 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003795 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003796}
3797
3798
3799PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003800PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003801 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003802 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3803}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003804
Christian Heimes5894ba72007-11-04 11:43:14 +00003805PyObject*
3806PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3807{
Steve Dowercc16be82016-09-08 10:35:16 -07003808#if defined(__APPLE__)
3809 return PyUnicode_DecodeUTF8Stateful(s, size, Py_FileSystemDefaultEncodeErrors, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003810#else
Victor Stinner793b5312011-04-27 00:24:21 +02003811 PyInterpreterState *interp = PyThreadState_GET()->interp;
3812 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3813 cannot use it to encode and decode filenames before it is loaded. Load
3814 the Python codec requires to encode at least its own filename. Use the C
3815 version of the locale codec until the codec registry is initialized and
3816 the Python codec is loaded.
3817
3818 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3819 cannot only rely on it: check also interp->fscodec_initialized for
3820 subinterpreters. */
3821 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Steve Dowercc16be82016-09-08 10:35:16 -07003822 PyObject *res = PyUnicode_Decode(s, size,
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003823 Py_FileSystemDefaultEncoding,
Steve Dowercc16be82016-09-08 10:35:16 -07003824 Py_FileSystemDefaultEncodeErrors);
3825#ifdef MS_WINDOWS
3826 if (!res && PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
Serhiy Storchaka467ab192016-10-21 17:09:17 +03003827 _PyErr_FormatFromCause(PyExc_RuntimeError,
3828 "filesystem path bytes were not correctly encoded with '%s'. "
Steve Dowercc16be82016-09-08 10:35:16 -07003829 "Please report this at http://bugs.python.org/issue27781",
3830 Py_FileSystemDefaultEncoding);
Steve Dowercc16be82016-09-08 10:35:16 -07003831 }
3832#endif
3833 return res;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003834 }
3835 else {
Steve Dowercc16be82016-09-08 10:35:16 -07003836 return PyUnicode_DecodeLocaleAndSize(s, size, Py_FileSystemDefaultEncodeErrors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003837 }
Victor Stinnerad158722010-10-27 00:25:46 +00003838#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003839}
3840
Martin v. Löwis011e8422009-05-05 04:43:17 +00003841
3842int
3843PyUnicode_FSConverter(PyObject* arg, void* addr)
3844{
Brett Cannonec6ce872016-09-06 15:50:29 -07003845 PyObject *path = NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003846 PyObject *output = NULL;
3847 Py_ssize_t size;
3848 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003849 if (arg == NULL) {
3850 Py_DECREF(*(PyObject**)addr);
Benjamin Petersona4d33b32015-11-15 21:57:39 -08003851 *(PyObject**)addr = NULL;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003852 return 1;
3853 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003854 path = PyOS_FSPath(arg);
3855 if (path == NULL) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003856 return 0;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003857 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003858 if (PyBytes_Check(path)) {
3859 output = path;
3860 }
3861 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
3862 output = PyUnicode_EncodeFSDefault(path);
3863 Py_DECREF(path);
3864 if (!output) {
3865 return 0;
3866 }
3867 assert(PyBytes_Check(output));
3868 }
3869
Victor Stinner0ea2a462010-04-30 00:22:08 +00003870 size = PyBytes_GET_SIZE(output);
3871 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02003872 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003873 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003874 Py_DECREF(output);
3875 return 0;
3876 }
3877 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003878 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003879}
3880
3881
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003882int
3883PyUnicode_FSDecoder(PyObject* arg, void* addr)
3884{
Brett Cannona5711202016-09-06 19:36:01 -07003885 int is_buffer = 0;
3886 PyObject *path = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003887 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003888 if (arg == NULL) {
3889 Py_DECREF(*(PyObject**)addr);
3890 return 1;
3891 }
Brett Cannona5711202016-09-06 19:36:01 -07003892
3893 is_buffer = PyObject_CheckBuffer(arg);
3894 if (!is_buffer) {
3895 path = PyOS_FSPath(arg);
3896 if (path == NULL) {
Serhiy Storchakafebc3322016-08-06 23:29:29 +03003897 return 0;
3898 }
Brett Cannona5711202016-09-06 19:36:01 -07003899 }
3900 else {
3901 path = arg;
3902 Py_INCREF(arg);
3903 }
3904
3905 if (PyUnicode_Check(path)) {
3906 if (PyUnicode_READY(path) == -1) {
3907 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003908 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07003909 }
3910 output = path;
3911 }
3912 else if (PyBytes_Check(path) || is_buffer) {
3913 PyObject *path_bytes = NULL;
3914
3915 if (!PyBytes_Check(path) &&
3916 PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
3917 "path should be string, bytes, or os.PathLike, not %.200s",
3918 Py_TYPE(arg)->tp_name)) {
3919 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003920 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07003921 }
3922 path_bytes = PyBytes_FromObject(path);
3923 Py_DECREF(path);
3924 if (!path_bytes) {
3925 return 0;
3926 }
3927 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
3928 PyBytes_GET_SIZE(path_bytes));
3929 Py_DECREF(path_bytes);
3930 if (!output) {
3931 return 0;
3932 }
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003933 }
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003934 else {
3935 PyErr_Format(PyExc_TypeError,
Brett Cannona5711202016-09-06 19:36:01 -07003936 "path should be string, bytes, or os.PathLike, not %.200s",
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003937 Py_TYPE(arg)->tp_name);
Brett Cannona5711202016-09-06 19:36:01 -07003938 Py_DECREF(path);
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003939 return 0;
3940 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003941 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003942 Py_DECREF(output);
3943 return 0;
3944 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003945 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003946 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003947 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003948 Py_DECREF(output);
3949 return 0;
3950 }
3951 *(PyObject**)addr = output;
3952 return Py_CLEANUP_SUPPORTED;
3953}
3954
3955
Martin v. Löwis5b222132007-06-10 09:51:05 +00003956char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003957PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003958{
Christian Heimesf3863112007-11-22 07:46:41 +00003959 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003960
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003961 if (!PyUnicode_Check(unicode)) {
3962 PyErr_BadArgument();
3963 return NULL;
3964 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003965 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003966 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003967
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003968 if (PyUnicode_UTF8(unicode) == NULL) {
3969 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003970 bytes = _PyUnicode_AsUTF8String(unicode, NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003971 if (bytes == NULL)
3972 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003973 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3974 if (_PyUnicode_UTF8(unicode) == NULL) {
Victor Stinnera5afb582013-10-29 01:28:23 +01003975 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003976 Py_DECREF(bytes);
3977 return NULL;
3978 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003979 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
Christian Heimesf051e432016-09-13 20:22:02 +02003980 memcpy(_PyUnicode_UTF8(unicode),
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003981 PyBytes_AS_STRING(bytes),
3982 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003983 Py_DECREF(bytes);
3984 }
3985
3986 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003987 *psize = PyUnicode_UTF8_LENGTH(unicode);
3988 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003989}
3990
3991char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003992PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003993{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003994 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3995}
3996
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003997Py_UNICODE *
3998PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3999{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004000 const unsigned char *one_byte;
4001#if SIZEOF_WCHAR_T == 4
4002 const Py_UCS2 *two_bytes;
4003#else
4004 const Py_UCS4 *four_bytes;
4005 const Py_UCS4 *ucs4_end;
4006 Py_ssize_t num_surrogates;
4007#endif
4008 wchar_t *w;
4009 wchar_t *wchar_end;
4010
4011 if (!PyUnicode_Check(unicode)) {
4012 PyErr_BadArgument();
4013 return NULL;
4014 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004015 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004016 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004017 assert(_PyUnicode_KIND(unicode) != 0);
4018 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004019
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004020 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004021#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004022 four_bytes = PyUnicode_4BYTE_DATA(unicode);
4023 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004024 num_surrogates = 0;
4025
4026 for (; four_bytes < ucs4_end; ++four_bytes) {
4027 if (*four_bytes > 0xFFFF)
4028 ++num_surrogates;
4029 }
4030
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004031 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
4032 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
4033 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004034 PyErr_NoMemory();
4035 return NULL;
4036 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004037 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004038
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004039 w = _PyUnicode_WSTR(unicode);
4040 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
4041 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004042 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
4043 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004044 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004045 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01004046 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
4047 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004048 }
4049 else
4050 *w = *four_bytes;
4051
4052 if (w > wchar_end) {
4053 assert(0 && "Miscalculated string end");
4054 }
4055 }
4056 *w = 0;
4057#else
4058 /* sizeof(wchar_t) == 4 */
4059 Py_FatalError("Impossible unicode object state, wstr and str "
4060 "should share memory already.");
4061 return NULL;
4062#endif
4063 }
4064 else {
Serhiy Storchakae55181f2015-02-20 21:34:06 +02004065 if ((size_t)_PyUnicode_LENGTH(unicode) >
4066 PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
4067 PyErr_NoMemory();
4068 return NULL;
4069 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004070 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
4071 (_PyUnicode_LENGTH(unicode) + 1));
4072 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004073 PyErr_NoMemory();
4074 return NULL;
4075 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004076 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
4077 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
4078 w = _PyUnicode_WSTR(unicode);
4079 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004080
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004081 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
4082 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004083 for (; w < wchar_end; ++one_byte, ++w)
4084 *w = *one_byte;
4085 /* null-terminate the wstr */
4086 *w = 0;
4087 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004088 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004089#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004090 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004091 for (; w < wchar_end; ++two_bytes, ++w)
4092 *w = *two_bytes;
4093 /* null-terminate the wstr */
4094 *w = 0;
4095#else
4096 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004097 PyObject_FREE(_PyUnicode_WSTR(unicode));
4098 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004099 Py_FatalError("Impossible unicode object state, wstr "
4100 "and str should share memory already.");
4101 return NULL;
4102#endif
4103 }
4104 else {
4105 assert(0 && "This should never happen.");
4106 }
4107 }
4108 }
4109 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004110 *size = PyUnicode_WSTR_LENGTH(unicode);
4111 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00004112}
4113
Alexander Belopolsky40018472011-02-26 01:02:56 +00004114Py_UNICODE *
4115PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004116{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004117 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004118}
4119
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004120
Alexander Belopolsky40018472011-02-26 01:02:56 +00004121Py_ssize_t
4122PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004123{
4124 if (!PyUnicode_Check(unicode)) {
4125 PyErr_BadArgument();
4126 goto onError;
4127 }
4128 return PyUnicode_GET_SIZE(unicode);
4129
Benjamin Peterson29060642009-01-31 22:14:21 +00004130 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004131 return -1;
4132}
4133
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004134Py_ssize_t
4135PyUnicode_GetLength(PyObject *unicode)
4136{
Victor Stinner07621332012-06-16 04:53:46 +02004137 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004138 PyErr_BadArgument();
4139 return -1;
4140 }
Victor Stinner07621332012-06-16 04:53:46 +02004141 if (PyUnicode_READY(unicode) == -1)
4142 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004143 return PyUnicode_GET_LENGTH(unicode);
4144}
4145
4146Py_UCS4
4147PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4148{
Victor Stinner69ed0f42013-04-09 21:48:24 +02004149 void *data;
4150 int kind;
4151
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004152 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
4153 PyErr_BadArgument();
4154 return (Py_UCS4)-1;
4155 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004156 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004157 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004158 return (Py_UCS4)-1;
4159 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02004160 data = PyUnicode_DATA(unicode);
4161 kind = PyUnicode_KIND(unicode);
4162 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004163}
4164
4165int
4166PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4167{
4168 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004169 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004170 return -1;
4171 }
Victor Stinner488fa492011-12-12 00:01:39 +01004172 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004173 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004174 PyErr_SetString(PyExc_IndexError, "string index out of range");
4175 return -1;
4176 }
Victor Stinner488fa492011-12-12 00:01:39 +01004177 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004178 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004179 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4180 PyErr_SetString(PyExc_ValueError, "character out of range");
4181 return -1;
4182 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004183 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4184 index, ch);
4185 return 0;
4186}
4187
/* Return the name of the default encoding, which is always "utf-8".
   The returned string has static storage and must not be modified. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4193
Victor Stinner554f3f02010-06-16 23:33:54 +00004194/* create or adjust a UnicodeDecodeError */
4195static void
4196make_decode_exception(PyObject **exceptionObject,
4197 const char *encoding,
4198 const char *input, Py_ssize_t length,
4199 Py_ssize_t startpos, Py_ssize_t endpos,
4200 const char *reason)
4201{
4202 if (*exceptionObject == NULL) {
4203 *exceptionObject = PyUnicodeDecodeError_Create(
4204 encoding, input, length, startpos, endpos, reason);
4205 }
4206 else {
4207 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4208 goto onError;
4209 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4210 goto onError;
4211 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4212 goto onError;
4213 }
4214 return;
4215
4216onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004217 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004218}
4219
#ifdef MS_WINDOWS
/* Error handling callback helper (wchar_t output variant):
   build arguments, call the callback and check the result;
   if no exception occurred, copy the replacement to the output
   and adjust various state variables.

   errors/errorHandler: name of the error handler and a cache slot for the
       looked-up callable (filled in on first use).
   encoding/reason/startinpos/endinpos: used to create or update the
       UnicodeDecodeError stored in *exceptionObject.
   input/inend: bounds of the input buffer; they are re-read from the
       exception's object afterwards because the callback may have
       replaced it.
   inptr: receives the position at which decoding should resume.
   output/outpos: wchar_t-kind unicode object being filled and the current
       write offset into it; the object is resized when needed.

   return 0 on success, -1 on error
*/

static int
unicode_decode_call_errorhandler_wchar(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    PyObject **output, Py_ssize_t *outpos)
{
    static const char *argparse = "Un;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t outsize;
    Py_ssize_t insize;
    Py_ssize_t requiredsize;
    Py_ssize_t newpos;
    PyObject *inputobj = NULL;
    wchar_t *repwstr;
    Py_ssize_t repwlen;

    assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
    outsize = _PyUnicode_WSTR_LENGTH(*output);

    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        /* &argparse[3] skips the "Un;" format prefix, leaving the message. */
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
        goto onError;
    }
    if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    /* A negative position counts from the end of the input. */
    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
    if (repwstr == NULL)
        goto onError;
    /* need more space? (at least enough for what we
       have+the replacement+the rest of the string (starting
       at the new input position), so we won't have to check space
       when there are no errors in the rest of the string) */
    requiredsize = *outpos;
    if (requiredsize > PY_SSIZE_T_MAX - repwlen)
        goto overflow;
    requiredsize += repwlen;
    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
        goto overflow;
    requiredsize += insize - newpos;
    if (requiredsize > outsize) {
        /* Grow geometrically when doubling is both safe and sufficient. */
        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
            requiredsize = 2*outsize;
        if (unicode_resize(output, requiredsize) < 0)
            goto onError;
    }
    /* Copy by explicit length: the replacement may legitimately contain
       null characters, and wcsncpy() would stop at the first one and
       zero-fill the remainder. */
    memcpy(_PyUnicode_WSTR(*output) + *outpos, repwstr,
           (size_t)repwlen * sizeof(wchar_t));
    *outpos += repwlen;
    *endinpos = newpos;
    *inptr = *input + newpos;

    /* we made it! */
    Py_DECREF(restuple);
    return 0;

  overflow:
    PyErr_SetString(PyExc_OverflowError,
                    "decoded result is too long for a Python string");

  onError:
    Py_XDECREF(restuple);
    return -1;
}
#endif   /* MS_WINDOWS */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004332
4333static int
4334unicode_decode_call_errorhandler_writer(
4335 const char *errors, PyObject **errorHandler,
4336 const char *encoding, const char *reason,
4337 const char **input, const char **inend, Py_ssize_t *startinpos,
4338 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4339 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4340{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004341 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004342
4343 PyObject *restuple = NULL;
4344 PyObject *repunicode = NULL;
4345 Py_ssize_t insize;
4346 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004347 Py_ssize_t replen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004348 PyObject *inputobj = NULL;
4349
4350 if (*errorHandler == NULL) {
4351 *errorHandler = PyCodec_LookupError(errors);
4352 if (*errorHandler == NULL)
4353 goto onError;
4354 }
4355
4356 make_decode_exception(exceptionObject,
4357 encoding,
4358 *input, *inend - *input,
4359 *startinpos, *endinpos,
4360 reason);
4361 if (*exceptionObject == NULL)
4362 goto onError;
4363
4364 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4365 if (restuple == NULL)
4366 goto onError;
4367 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004368 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004369 goto onError;
4370 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004371 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004372 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004373
4374 /* Copy back the bytes variables, which might have been modified by the
4375 callback */
4376 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4377 if (!inputobj)
4378 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00004379 *input = PyBytes_AS_STRING(inputobj);
4380 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004381 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004382 /* we can DECREF safely, as the exception has another reference,
4383 so the object won't go away. */
4384 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004385
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004386 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004387 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004388 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004389 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004390 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004391 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004392
Victor Stinner170ca6f2013-04-18 00:25:28 +02004393 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004394 if (replen > 1) {
4395 writer->min_length += replen - 1;
Victor Stinner8f674cc2013-04-17 23:02:17 +02004396 writer->overallocate = 1;
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004397 if (_PyUnicodeWriter_Prepare(writer, writer->min_length,
4398 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4399 goto onError;
4400 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004401 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004402 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004403
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004404 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004405 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004406
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004407 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004408 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004409 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004410
Benjamin Peterson29060642009-01-31 22:14:21 +00004411 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004412 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004413 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004414}
4415
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Three simple macros defining base-64. */

/* Is c a base-64 character? */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')

/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is only ever read, so it is declared const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * The flag arguments are parenthesized so that a compound expression
 * (e.g. `a || b`) is evaluated as a unit before being AND-ed with the
 * category test. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004496
Alexander Belopolsky40018472011-02-26 01:02:56 +00004497PyObject *
4498PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004499 Py_ssize_t size,
4500 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004501{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004502 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4503}
4504
Antoine Pitrou244651a2009-05-04 18:56:13 +00004505/* The decoder. The only state we preserve is our read position,
4506 * i.e. how many characters we have consumed. So if we end in the
4507 * middle of a shift sequence we have to back off the read position
4508 * and the output to the beginning of the sequence, otherwise we lose
4509 * all the shift state (seen bits, number of bits seen, high
4510 * surrogate). */
4511
/* Decode `size` bytes of UTF-7 starting at `s`.
 *
 * errors:   error handler name, forwarded to
 *           unicode_decode_call_errorhandler_writer().
 * consumed: if non-NULL, receives the number of input bytes consumed.
 *           When the input ends inside a shift sequence this is the
 *           offset of the '+' that opened it, and the output produced
 *           since that '+' is discarded.  If NULL, ending inside a shift
 *           sequence with pending state is reported through the error
 *           handler as "unterminated shift sequence".
 *
 * Returns the decoded string, or NULL on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00004512PyObject *
4513PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004514 Py_ssize_t size,
4515 const char *errors,
4516 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004517{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004518 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004519 Py_ssize_t startinpos;
4520 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004521 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004522 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004523 const char *errmsg = "";
4524 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004525 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004526 unsigned int base64bits = 0;
4527 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004528 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004529 PyObject *errorHandler = NULL;
4530 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004531
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004532 if (size == 0) {
4533 if (consumed)
4534 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004535 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004536 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004537
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004538 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004539 _PyUnicodeWriter_Init(&writer);
4540 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004541
4542 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004543 e = s + size;
4544
4545 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004546 Py_UCS4 ch;
/* `restart` is also jumped to from the end-of-input handling below, when
   the error handler has moved `s` back before `e`. */
Benjamin Peterson29060642009-01-31 22:14:21 +00004547 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004548 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004549
Antoine Pitrou244651a2009-05-04 18:56:13 +00004550 if (inShift) { /* in a base-64 section */
4551 if (IS_BASE64(ch)) { /* consume a base-64 character */
4552 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4553 base64bits += 6;
4554 s++;
4555 if (base64bits >= 16) {
4556 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004557 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004558 base64bits -= 16;
4559 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004560 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004561 if (surrogate) {
4562 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004563 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4564 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004565 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004566 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004567 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004568 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004569 }
4570 else {
/* Not a low surrogate: the pending high surrogate is written out
   on its own, then outCh is handled normally below. */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004571 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004572 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004573 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004574 }
4575 }
Victor Stinner551ac952011-11-29 22:58:13 +01004576 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004577 /* first surrogate */
4578 surrogate = outCh;
4579 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004580 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004581 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004582 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004583 }
4584 }
4585 }
4586 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004587 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004588 if (base64bits > 0) { /* left-over bits */
4589 if (base64bits >= 6) {
4590 /* We've seen at least one base-64 character */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004591 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004592 errmsg = "partial character in shift sequence";
4593 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004594 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004595 else {
4596 /* Some bits remain; they should be zero */
4597 if (base64buffer != 0) {
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004598 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004599 errmsg = "non-zero padding bits in shift sequence";
4600 goto utf7Error;
4601 }
4602 }
4603 }
/* A pending high surrogate is written out unpaired only when the byte
   ending the shift sequence decodes directly; otherwise it is dropped
   by the reset below. */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004604 if (surrogate && DECODE_DIRECT(ch)) {
4605 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4606 goto onError;
4607 }
4608 surrogate = 0;
4609 if (ch == '-') {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004610 /* '-' is absorbed; other terminating
4611 characters are preserved */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004612 s++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004613 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004614 }
4615 }
4616 else if ( ch == '+' ) {
/* Remember where the '+' is: it is the error start position and, in
   stateful mode, the value reported through *consumed. */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004617 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004618 s++; /* consume '+' */
4619 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004620 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004621 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004622 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004623 }
4624 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004625 inShift = 1;
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004626 surrogate = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004627 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004628 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004629 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004630 }
4631 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004632 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004633 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004634 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004635 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004636 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004637 else {
4638 startinpos = s-starts;
4639 s++;
4640 errmsg = "unexpected special character";
4641 goto utf7Error;
4642 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004643 continue;
/* Common error exit inside the loop: the error branches above have already
   advanced `s` past the offending byte, so s-starts is the end of the
   error range.  The handler then sets `s` to the resume position. */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004644utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004645 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004646 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004647 errors, &errorHandler,
4648 "utf7", errmsg,
4649 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004650 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004651 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004652 }
4653
Antoine Pitrou244651a2009-05-04 18:56:13 +00004654 /* end of string */
4655
4656 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4657 /* if we're in an inconsistent state, that's an error */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004658 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004659 if (surrogate ||
4660 (base64bits >= 6) ||
4661 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004662 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004663 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004664 errors, &errorHandler,
4665 "utf7", "unterminated shift sequence",
4666 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004667 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004668 goto onError;
/* The handler may have moved `s` back into the buffer; if so, resume
   decoding from there. */
4669 if (s < e)
4670 goto restart;
4671 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004672 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004673
4674 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004675 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004676 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004677 *consumed = startinpos;
/* Characters were written after the shift sequence began and the writer
   holds non-ASCII data: return a fresh string built from only the first
   shiftOutStart characters instead of finishing the writer.
   NOTE(review): presumably so the discarded characters do not influence
   the kind of the result -- confirm. */
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004678 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004679 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004680 writer.kind, writer.data, shiftOutStart);
4681 Py_XDECREF(errorHandler);
4682 Py_XDECREF(exc);
4683 _PyUnicodeWriter_Dealloc(&writer);
4684 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004685 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004686 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004687 }
4688 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004689 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004690 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004691 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004692
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004693 Py_XDECREF(errorHandler);
4694 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004695 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004696
Benjamin Peterson29060642009-01-31 22:14:21 +00004697 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004698 Py_XDECREF(errorHandler);
4699 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004700 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004701 return NULL;
4702}
4703
4704
Alexander Belopolsky40018472011-02-26 01:02:56 +00004705PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004706_PyUnicode_EncodeUTF7(PyObject *str,
4707 int base64SetO,
4708 int base64WhiteSpace,
4709 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004710{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004711 int kind;
4712 void *data;
4713 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004714 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004715 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004716 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004717 unsigned int base64bits = 0;
4718 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004719 char * out;
4720 char * start;
4721
Benjamin Petersonbac79492012-01-14 13:34:47 -05004722 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004723 return NULL;
4724 kind = PyUnicode_KIND(str);
4725 data = PyUnicode_DATA(str);
4726 len = PyUnicode_GET_LENGTH(str);
4727
4728 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004729 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004730
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004731 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004732 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004733 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004734 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004735 if (v == NULL)
4736 return NULL;
4737
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004738 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004739 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004740 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004741
Antoine Pitrou244651a2009-05-04 18:56:13 +00004742 if (inShift) {
4743 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4744 /* shifting out */
4745 if (base64bits) { /* output remaining bits */
4746 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4747 base64buffer = 0;
4748 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004749 }
4750 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004751 /* Characters not in the BASE64 set implicitly unshift the sequence
4752 so no '-' is required, except if the character is itself a '-' */
4753 if (IS_BASE64(ch) || ch == '-') {
4754 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004755 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004756 *out++ = (char) ch;
4757 }
4758 else {
4759 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004760 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004761 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004762 else { /* not in a shift sequence */
4763 if (ch == '+') {
4764 *out++ = '+';
4765 *out++ = '-';
4766 }
4767 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4768 *out++ = (char) ch;
4769 }
4770 else {
4771 *out++ = '+';
4772 inShift = 1;
4773 goto encode_char;
4774 }
4775 }
4776 continue;
4777encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004778 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004779 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004780
Antoine Pitrou244651a2009-05-04 18:56:13 +00004781 /* code first surrogate */
4782 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004783 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004784 while (base64bits >= 6) {
4785 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4786 base64bits -= 6;
4787 }
4788 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004789 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004790 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004791 base64bits += 16;
4792 base64buffer = (base64buffer << 16) | ch;
4793 while (base64bits >= 6) {
4794 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4795 base64bits -= 6;
4796 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004797 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004798 if (base64bits)
4799 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4800 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004801 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004802 if (_PyBytes_Resize(&v, out - start) < 0)
4803 return NULL;
4804 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004805}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004806PyObject *
4807PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4808 Py_ssize_t size,
4809 int base64SetO,
4810 int base64WhiteSpace,
4811 const char *errors)
4812{
4813 PyObject *result;
4814 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4815 if (tmp == NULL)
4816 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004817 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004818 base64WhiteSpace, errors);
4819 Py_DECREF(tmp);
4820 return result;
4821}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004822
Antoine Pitrou244651a2009-05-04 18:56:13 +00004823#undef IS_BASE64
4824#undef FROM_BASE64
4825#undef TO_BASE64
4826#undef DECODE_DIRECT
4827#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004828
Guido van Rossumd57fd912000-03-10 22:53:23 +00004829/* --- UTF-8 Codec -------------------------------------------------------- */
4830
Alexander Belopolsky40018472011-02-26 01:02:56 +00004831PyObject *
4832PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004833 Py_ssize_t size,
4834 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004835{
Walter Dörwald69652032004-09-07 20:24:22 +00004836 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4837}
4838
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004839#include "stringlib/asciilib.h"
4840#include "stringlib/codecs.h"
4841#include "stringlib/undef.h"
4842
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004843#include "stringlib/ucs1lib.h"
4844#include "stringlib/codecs.h"
4845#include "stringlib/undef.h"
4846
4847#include "stringlib/ucs2lib.h"
4848#include "stringlib/codecs.h"
4849#include "stringlib/undef.h"
4850
4851#include "stringlib/ucs4lib.h"
4852#include "stringlib/codecs.h"
4853#include "stringlib/undef.h"
4854
Antoine Pitrouab868312009-01-10 15:40:25 +00004855/* Mask to quickly check whether a C 'long' contains a
4856 non-ASCII, UTF8-encoded char. */
4857#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004858# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004859#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004860# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004861#else
4862# error C 'long' size should be either 4 or 8!
4863#endif
4864
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004865static Py_ssize_t
4866ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004867{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004868 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004869 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004870
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004871 /*
4872 * Issue #17237: m68k is a bit different from most architectures in
4873 * that objects do not use "natural alignment" - for example, int and
4874 * long are only aligned at 2-byte boundaries. Therefore the assert()
4875 * won't work; also, tests have shown that skipping the "optimised
4876 * version" will even speed up m68k.
4877 */
4878#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004879#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004880 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4881 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004882 /* Fast path, see in STRINGLIB(utf8_decode) for
4883 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004884 /* Help allocation */
4885 const char *_p = p;
4886 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004887 while (_p < aligned_end) {
4888 unsigned long value = *(const unsigned long *) _p;
4889 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004890 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004891 *((unsigned long *)q) = value;
4892 _p += SIZEOF_LONG;
4893 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004894 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004895 p = _p;
4896 while (p < end) {
4897 if ((unsigned char)*p & 0x80)
4898 break;
4899 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004900 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004901 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004902 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004903#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004904#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004905 while (p < end) {
4906 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4907 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004908 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004909 /* Help allocation */
4910 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004911 while (_p < aligned_end) {
4912 unsigned long value = *(unsigned long *) _p;
4913 if (value & ASCII_CHAR_MASK)
4914 break;
4915 _p += SIZEOF_LONG;
4916 }
4917 p = _p;
4918 if (_p == end)
4919 break;
4920 }
4921 if ((unsigned char)*p & 0x80)
4922 break;
4923 ++p;
4924 }
4925 memcpy(dest, start, p - start);
4926 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004927}
Antoine Pitrouab868312009-01-10 15:40:25 +00004928
Victor Stinner785938e2011-12-11 20:09:03 +01004929PyObject *
4930PyUnicode_DecodeUTF8Stateful(const char *s,
4931 Py_ssize_t size,
4932 const char *errors,
4933 Py_ssize_t *consumed)
4934{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004935 _PyUnicodeWriter writer;
Victor Stinner785938e2011-12-11 20:09:03 +01004936 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004937 const char *end = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004938
4939 Py_ssize_t startinpos;
4940 Py_ssize_t endinpos;
4941 const char *errmsg = "";
Victor Stinner1d65d912015-10-05 13:43:50 +02004942 PyObject *error_handler_obj = NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004943 PyObject *exc = NULL;
Victor Stinner1d65d912015-10-05 13:43:50 +02004944 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner785938e2011-12-11 20:09:03 +01004945
4946 if (size == 0) {
4947 if (consumed)
4948 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004949 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004950 }
4951
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004952 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4953 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004954 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004955 *consumed = 1;
4956 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004957 }
4958
Victor Stinner8f674cc2013-04-17 23:02:17 +02004959 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02004960 writer.min_length = size;
4961 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004962 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004963
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004964 writer.pos = ascii_decode(s, end, writer.data);
4965 s += writer.pos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004966 while (s < end) {
4967 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004968 int kind = writer.kind;
Victor Stinner1d65d912015-10-05 13:43:50 +02004969
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004970 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004971 if (PyUnicode_IS_ASCII(writer.buffer))
4972 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004973 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004974 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004975 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004976 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004977 } else {
4978 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004979 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004980 }
4981
4982 switch (ch) {
4983 case 0:
4984 if (s == end || consumed)
4985 goto End;
4986 errmsg = "unexpected end of data";
4987 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004988 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004989 break;
4990 case 1:
4991 errmsg = "invalid start byte";
4992 startinpos = s - starts;
4993 endinpos = startinpos + 1;
4994 break;
4995 case 2:
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004996 case 3:
4997 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004998 errmsg = "invalid continuation byte";
4999 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005000 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005001 break;
5002 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005003 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005004 goto onError;
5005 continue;
5006 }
5007
Victor Stinner1d65d912015-10-05 13:43:50 +02005008 if (error_handler == _Py_ERROR_UNKNOWN)
5009 error_handler = get_error_handler(errors);
5010
5011 switch (error_handler) {
5012 case _Py_ERROR_IGNORE:
5013 s += (endinpos - startinpos);
5014 break;
5015
5016 case _Py_ERROR_REPLACE:
5017 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
5018 goto onError;
5019 s += (endinpos - startinpos);
5020 break;
5021
5022 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner74e8fac2015-10-05 13:49:26 +02005023 {
5024 Py_ssize_t i;
5025
Victor Stinner1d65d912015-10-05 13:43:50 +02005026 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
5027 goto onError;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005028 for (i=startinpos; i<endinpos; i++) {
Victor Stinner1d65d912015-10-05 13:43:50 +02005029 ch = (Py_UCS4)(unsigned char)(starts[i]);
5030 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
5031 ch + 0xdc00);
5032 writer.pos++;
5033 }
5034 s += (endinpos - startinpos);
5035 break;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005036 }
Victor Stinner1d65d912015-10-05 13:43:50 +02005037
5038 default:
5039 if (unicode_decode_call_errorhandler_writer(
5040 errors, &error_handler_obj,
5041 "utf-8", errmsg,
5042 &starts, &end, &startinpos, &endinpos, &exc, &s,
5043 &writer))
5044 goto onError;
5045 }
Victor Stinner785938e2011-12-11 20:09:03 +01005046 }
5047
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005048End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005049 if (consumed)
5050 *consumed = s - starts;
5051
Victor Stinner1d65d912015-10-05 13:43:50 +02005052 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005053 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005054 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005055
5056onError:
Victor Stinner1d65d912015-10-05 13:43:50 +02005057 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005058 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005059 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005060 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01005061}
5062
#ifdef __APPLE__

/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X.

   s    -- UTF-8 encoded input bytes
   size -- number of input bytes

   Every byte that cannot be decoded is stored as 0xDC00 + byte.

   Return a pointer to a newly allocated, NUL-terminated wide character
   string (use PyMem_RawFree() to free the memory), or NULL on memory
   allocation error. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    const char *e;
    wchar_t *unicode;
    Py_ssize_t outpos;

    /* Note: size will always be longer than the resulting Unicode
       character count */
    if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1))
        return NULL;
    unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
    if (!unicode)
        return NULL;

    /* Unpack UTF-8 encoded data */
    e = s + size;
    outpos = 0;
    while (s < e) {
        Py_UCS4 ch;
#if SIZEOF_WCHAR_T == 4
        ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
#else
        ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
#endif
        if (ch > 0xFF) {
#if SIZEOF_WCHAR_T == 4
            assert(0);
#else
            /* The value handed back here is a code point that is about to
               be split into a surrogate pair, so it must lie above the
               BMP.  It is never itself a surrogate (0xD800-0xDFFF), which
               is what the previous assertion wrongly required. */
            assert(ch > 0xFFFF && ch <= 0x10FFFF);
            /* compute and append the two surrogates: */
            unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
            unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
#endif
        }
        else {
            if (!ch && s == e)
                break;
            /* surrogateescape */
            unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
        }
    }
    unicode[outpos] = L'\0';
    return unicode;
}

#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00005118
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005119/* Primary internal function which creates utf8 encoded bytes objects.
5120
5121 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005122 and allocate exactly as much space needed at the end. Else allocate the
5123 maximum possible needed (4 result bytes per Unicode character), and return
5124 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005125*/
Tim Peters7e3d9612002-04-21 03:26:37 +00005126PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01005127_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005128{
Victor Stinner6099a032011-12-18 14:22:26 +01005129 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005130 void *data;
5131 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00005132
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005133 if (!PyUnicode_Check(unicode)) {
5134 PyErr_BadArgument();
5135 return NULL;
5136 }
5137
5138 if (PyUnicode_READY(unicode) == -1)
5139 return NULL;
5140
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005141 if (PyUnicode_UTF8(unicode))
5142 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5143 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005144
5145 kind = PyUnicode_KIND(unicode);
5146 data = PyUnicode_DATA(unicode);
5147 size = PyUnicode_GET_LENGTH(unicode);
5148
Benjamin Petersonead6b532011-12-20 17:23:42 -06005149 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005150 default:
5151 assert(0);
5152 case PyUnicode_1BYTE_KIND:
5153 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5154 assert(!PyUnicode_IS_ASCII(unicode));
5155 return ucs1lib_utf8_encoder(unicode, data, size, errors);
5156 case PyUnicode_2BYTE_KIND:
5157 return ucs2lib_utf8_encoder(unicode, data, size, errors);
5158 case PyUnicode_4BYTE_KIND:
5159 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00005160 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005161}
5162
Alexander Belopolsky40018472011-02-26 01:02:56 +00005163PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005164PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5165 Py_ssize_t size,
5166 const char *errors)
5167{
5168 PyObject *v, *unicode;
5169
5170 unicode = PyUnicode_FromUnicode(s, size);
5171 if (unicode == NULL)
5172 return NULL;
5173 v = _PyUnicode_AsUTF8String(unicode, errors);
5174 Py_DECREF(unicode);
5175 return v;
5176}
5177
5178PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005179PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005180{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005181 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005182}
5183
Walter Dörwald41980ca2007-08-16 21:55:45 +00005184/* --- UTF-32 Codec ------------------------------------------------------- */
5185
5186PyObject *
5187PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005188 Py_ssize_t size,
5189 const char *errors,
5190 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005191{
5192 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5193}
5194
5195PyObject *
5196PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005197 Py_ssize_t size,
5198 const char *errors,
5199 int *byteorder,
5200 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005201{
5202 const char *starts = s;
5203 Py_ssize_t startinpos;
5204 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005205 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005206 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01005207 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005208 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005209 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005210 PyObject *errorHandler = NULL;
5211 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005212
Walter Dörwald41980ca2007-08-16 21:55:45 +00005213 q = (unsigned char *)s;
5214 e = q + size;
5215
5216 if (byteorder)
5217 bo = *byteorder;
5218
5219 /* Check for BOM marks (U+FEFF) in the input and adjust current
5220 byte order setting accordingly. In native mode, the leading BOM
5221 mark is skipped, in all other modes, it is copied to the output
5222 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01005223 if (bo == 0 && size >= 4) {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005224 Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005225 if (bom == 0x0000FEFF) {
5226 bo = -1;
5227 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005228 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005229 else if (bom == 0xFFFE0000) {
5230 bo = 1;
5231 q += 4;
5232 }
5233 if (byteorder)
5234 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005235 }
5236
Victor Stinnere64322e2012-10-30 23:12:47 +01005237 if (q == e) {
5238 if (consumed)
5239 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005240 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00005241 }
5242
Victor Stinnere64322e2012-10-30 23:12:47 +01005243#ifdef WORDS_BIGENDIAN
5244 le = bo < 0;
5245#else
5246 le = bo <= 0;
5247#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005248 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01005249
Victor Stinner8f674cc2013-04-17 23:02:17 +02005250 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005251 writer.min_length = (e - q + 3) / 4;
5252 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005253 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005254
Victor Stinnere64322e2012-10-30 23:12:47 +01005255 while (1) {
5256 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005257 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005258
Victor Stinnere64322e2012-10-30 23:12:47 +01005259 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005260 enum PyUnicode_Kind kind = writer.kind;
5261 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005262 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005263 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005264 if (le) {
5265 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005266 ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005267 if (ch > maxch)
5268 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005269 if (kind != PyUnicode_1BYTE_KIND &&
5270 Py_UNICODE_IS_SURROGATE(ch))
5271 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005272 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005273 q += 4;
5274 } while (q <= last);
5275 }
5276 else {
5277 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005278 ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
Victor Stinnere64322e2012-10-30 23:12:47 +01005279 if (ch > maxch)
5280 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005281 if (kind != PyUnicode_1BYTE_KIND &&
5282 Py_UNICODE_IS_SURROGATE(ch))
5283 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005284 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005285 q += 4;
5286 } while (q <= last);
5287 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005288 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005289 }
5290
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005291 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005292 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005293 startinpos = ((const char *)q) - starts;
5294 endinpos = startinpos + 4;
5295 }
5296 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005297 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005298 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005299 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005300 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005301 startinpos = ((const char *)q) - starts;
5302 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005303 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005304 else {
5305 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005306 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005307 goto onError;
5308 q += 4;
5309 continue;
5310 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005311 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005312 startinpos = ((const char *)q) - starts;
5313 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005314 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005315
5316 /* The remaining input chars are ignored if the callback
5317 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005318 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005319 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005320 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005321 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005322 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005323 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005324 }
5325
Walter Dörwald41980ca2007-08-16 21:55:45 +00005326 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005327 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005328
Walter Dörwald41980ca2007-08-16 21:55:45 +00005329 Py_XDECREF(errorHandler);
5330 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005331 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005332
Benjamin Peterson29060642009-01-31 22:14:21 +00005333 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005334 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005335 Py_XDECREF(errorHandler);
5336 Py_XDECREF(exc);
5337 return NULL;
5338}
5339
5340PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005341_PyUnicode_EncodeUTF32(PyObject *str,
5342 const char *errors,
5343 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005344{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005345 enum PyUnicode_Kind kind;
5346 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005347 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005348 PyObject *v;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005349 uint32_t *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005350#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005351 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005352#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005353 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005354#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005355 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005356 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005357 PyObject *errorHandler = NULL;
5358 PyObject *exc = NULL;
5359 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005360
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005361 if (!PyUnicode_Check(str)) {
5362 PyErr_BadArgument();
5363 return NULL;
5364 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005365 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005366 return NULL;
5367 kind = PyUnicode_KIND(str);
5368 data = PyUnicode_DATA(str);
5369 len = PyUnicode_GET_LENGTH(str);
5370
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005371 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005372 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005373 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005374 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005375 if (v == NULL)
5376 return NULL;
5377
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005378 /* output buffer is 4-bytes aligned */
5379 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005380 out = (uint32_t *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005381 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005382 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005383 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005384 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005385
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005386 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005387 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005388 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005389 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005390 else
5391 encoding = "utf-32";
5392
5393 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005394 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5395 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005396 }
5397
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005398 pos = 0;
5399 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005400 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005401
5402 if (kind == PyUnicode_2BYTE_KIND) {
5403 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5404 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005405 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005406 else {
5407 assert(kind == PyUnicode_4BYTE_KIND);
5408 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5409 &out, native_ordering);
5410 }
5411 if (pos == len)
5412 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005413
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005414 rep = unicode_encode_call_errorhandler(
5415 errors, &errorHandler,
5416 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005417 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005418 if (!rep)
5419 goto error;
5420
5421 if (PyBytes_Check(rep)) {
5422 repsize = PyBytes_GET_SIZE(rep);
5423 if (repsize & 3) {
5424 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005425 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005426 "surrogates not allowed");
5427 goto error;
5428 }
5429 moreunits = repsize / 4;
5430 }
5431 else {
5432 assert(PyUnicode_Check(rep));
5433 if (PyUnicode_READY(rep) < 0)
5434 goto error;
5435 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5436 if (!PyUnicode_IS_ASCII(rep)) {
5437 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005438 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005439 "surrogates not allowed");
5440 goto error;
5441 }
5442 }
5443
5444 /* four bytes are reserved for each surrogate */
5445 if (moreunits > 1) {
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005446 Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005447 Py_ssize_t morebytes = 4 * (moreunits - 1);
5448 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5449 /* integer overflow */
5450 PyErr_NoMemory();
5451 goto error;
5452 }
5453 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5454 goto error;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005455 out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005456 }
5457
5458 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005459 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005460 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005461 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005462 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005463 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5464 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005465 }
5466
5467 Py_CLEAR(rep);
5468 }
5469
5470 /* Cut back to size actually needed. This is necessary for, for example,
5471 encoding of a string containing isolated surrogates and the 'ignore'
5472 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005473 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005474 if (nsize != PyBytes_GET_SIZE(v))
5475 _PyBytes_Resize(&v, nsize);
5476 Py_XDECREF(errorHandler);
5477 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005478 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005479 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005480 error:
5481 Py_XDECREF(rep);
5482 Py_XDECREF(errorHandler);
5483 Py_XDECREF(exc);
5484 Py_XDECREF(v);
5485 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005486}
5487
Alexander Belopolsky40018472011-02-26 01:02:56 +00005488PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005489PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5490 Py_ssize_t size,
5491 const char *errors,
5492 int byteorder)
5493{
5494 PyObject *result;
5495 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5496 if (tmp == NULL)
5497 return NULL;
5498 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5499 Py_DECREF(tmp);
5500 return result;
5501}
5502
5503PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005504PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005505{
Victor Stinnerb960b342011-11-20 19:12:52 +01005506 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005507}
5508
Guido van Rossumd57fd912000-03-10 22:53:23 +00005509/* --- UTF-16 Codec ------------------------------------------------------- */
5510
Tim Peters772747b2001-08-09 22:21:55 +00005511PyObject *
5512PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005513 Py_ssize_t size,
5514 const char *errors,
5515 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005516{
Walter Dörwald69652032004-09-07 20:24:22 +00005517 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5518}
5519
5520PyObject *
5521PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005522 Py_ssize_t size,
5523 const char *errors,
5524 int *byteorder,
5525 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005526{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005527 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005528 Py_ssize_t startinpos;
5529 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005530 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005531 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005532 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005533 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005534 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005535 PyObject *errorHandler = NULL;
5536 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005537 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005538
Tim Peters772747b2001-08-09 22:21:55 +00005539 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005540 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005541
5542 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005543 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005544
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005545 /* Check for BOM marks (U+FEFF) in the input and adjust current
5546 byte order setting accordingly. In native mode, the leading BOM
5547 mark is skipped, in all other modes, it is copied to the output
5548 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005549 if (bo == 0 && size >= 2) {
5550 const Py_UCS4 bom = (q[1] << 8) | q[0];
5551 if (bom == 0xFEFF) {
5552 q += 2;
5553 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005554 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005555 else if (bom == 0xFFFE) {
5556 q += 2;
5557 bo = 1;
5558 }
5559 if (byteorder)
5560 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005561 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005562
Antoine Pitrou63065d72012-05-15 23:48:04 +02005563 if (q == e) {
5564 if (consumed)
5565 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005566 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005567 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005568
Christian Heimes743e0cd2012-10-17 23:52:17 +02005569#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005570 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005571 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005572#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005573 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005574 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005575#endif
Tim Peters772747b2001-08-09 22:21:55 +00005576
Antoine Pitrou63065d72012-05-15 23:48:04 +02005577 /* Note: size will always be longer than the resulting Unicode
5578 character count */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005579 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005580 writer.min_length = (e - q + 1) / 2;
5581 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005582 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005583
Antoine Pitrou63065d72012-05-15 23:48:04 +02005584 while (1) {
5585 Py_UCS4 ch = 0;
5586 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005587 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005588 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005589 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005590 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005591 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005592 native_ordering);
5593 else
5594 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005595 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005596 native_ordering);
5597 } else if (kind == PyUnicode_2BYTE_KIND) {
5598 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005599 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005600 native_ordering);
5601 } else {
5602 assert(kind == PyUnicode_4BYTE_KIND);
5603 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005604 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005605 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005606 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005607 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005608
Antoine Pitrou63065d72012-05-15 23:48:04 +02005609 switch (ch)
5610 {
5611 case 0:
5612 /* remaining byte at the end? (size should be even) */
5613 if (q == e || consumed)
5614 goto End;
5615 errmsg = "truncated data";
5616 startinpos = ((const char *)q) - starts;
5617 endinpos = ((const char *)e) - starts;
5618 break;
5619 /* The remaining input chars are ignored if the callback
5620 chooses to skip the input */
5621 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005622 q -= 2;
5623 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005624 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005625 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005626 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005627 endinpos = ((const char *)e) - starts;
5628 break;
5629 case 2:
5630 errmsg = "illegal encoding";
5631 startinpos = ((const char *)q) - 2 - starts;
5632 endinpos = startinpos + 2;
5633 break;
5634 case 3:
5635 errmsg = "illegal UTF-16 surrogate";
5636 startinpos = ((const char *)q) - 4 - starts;
5637 endinpos = startinpos + 2;
5638 break;
5639 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005640 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005641 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005642 continue;
5643 }
5644
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005645 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005646 errors,
5647 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005648 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00005649 &starts,
5650 (const char **)&e,
5651 &startinpos,
5652 &endinpos,
5653 &exc,
5654 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005655 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005656 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005657 }
5658
Antoine Pitrou63065d72012-05-15 23:48:04 +02005659End:
Walter Dörwald69652032004-09-07 20:24:22 +00005660 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005661 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005662
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005663 Py_XDECREF(errorHandler);
5664 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005665 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005666
Benjamin Peterson29060642009-01-31 22:14:21 +00005667 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005668 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005669 Py_XDECREF(errorHandler);
5670 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005671 return NULL;
5672}
5673
Tim Peters772747b2001-08-09 22:21:55 +00005674PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005675_PyUnicode_EncodeUTF16(PyObject *str,
5676 const char *errors,
5677 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005678{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005679 enum PyUnicode_Kind kind;
5680 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005681 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005682 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005683 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005684 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005685#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005686 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005687#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005688 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005689#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005690 const char *encoding;
5691 Py_ssize_t nsize, pos;
5692 PyObject *errorHandler = NULL;
5693 PyObject *exc = NULL;
5694 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00005695
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005696 if (!PyUnicode_Check(str)) {
5697 PyErr_BadArgument();
5698 return NULL;
5699 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005700 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005701 return NULL;
5702 kind = PyUnicode_KIND(str);
5703 data = PyUnicode_DATA(str);
5704 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005705
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005706 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005707 if (kind == PyUnicode_4BYTE_KIND) {
5708 const Py_UCS4 *in = (const Py_UCS4 *)data;
5709 const Py_UCS4 *end = in + len;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005710 while (in < end) {
5711 if (*in++ >= 0x10000) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005712 pairs++;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005713 }
5714 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005715 }
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005716 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005717 return PyErr_NoMemory();
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005718 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005719 nsize = len + pairs + (byteorder == 0);
5720 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005721 if (v == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005722 return NULL;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005723 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005724
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005725 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005726 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005727 out = (unsigned short *)PyBytes_AS_STRING(v);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005728 if (byteorder == 0) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005729 *out++ = 0xFEFF;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005730 }
5731 if (len == 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00005732 goto done;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005733 }
Tim Peters772747b2001-08-09 22:21:55 +00005734
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005735 if (kind == PyUnicode_1BYTE_KIND) {
5736 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5737 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005738 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005739
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005740 if (byteorder < 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005741 encoding = "utf-16-le";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005742 }
5743 else if (byteorder > 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005744 encoding = "utf-16-be";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005745 }
5746 else {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005747 encoding = "utf-16";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005748 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005749
5750 pos = 0;
5751 while (pos < len) {
5752 Py_ssize_t repsize, moreunits;
5753
5754 if (kind == PyUnicode_2BYTE_KIND) {
5755 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
5756 &out, native_ordering);
5757 }
5758 else {
5759 assert(kind == PyUnicode_4BYTE_KIND);
5760 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
5761 &out, native_ordering);
5762 }
5763 if (pos == len)
5764 break;
5765
5766 rep = unicode_encode_call_errorhandler(
5767 errors, &errorHandler,
5768 encoding, "surrogates not allowed",
5769 str, &exc, pos, pos + 1, &pos);
5770 if (!rep)
5771 goto error;
5772
5773 if (PyBytes_Check(rep)) {
5774 repsize = PyBytes_GET_SIZE(rep);
5775 if (repsize & 1) {
5776 raise_encode_exception(&exc, encoding,
5777 str, pos - 1, pos,
5778 "surrogates not allowed");
5779 goto error;
5780 }
5781 moreunits = repsize / 2;
5782 }
5783 else {
5784 assert(PyUnicode_Check(rep));
5785 if (PyUnicode_READY(rep) < 0)
5786 goto error;
5787 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5788 if (!PyUnicode_IS_ASCII(rep)) {
5789 raise_encode_exception(&exc, encoding,
5790 str, pos - 1, pos,
5791 "surrogates not allowed");
5792 goto error;
5793 }
5794 }
5795
5796 /* two bytes are reserved for each surrogate */
5797 if (moreunits > 1) {
5798 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
5799 Py_ssize_t morebytes = 2 * (moreunits - 1);
5800 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5801 /* integer overflow */
5802 PyErr_NoMemory();
5803 goto error;
5804 }
5805 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5806 goto error;
5807 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
5808 }
5809
5810 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005811 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005812 out += moreunits;
5813 } else /* rep is unicode */ {
5814 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5815 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5816 &out, native_ordering);
5817 }
5818
5819 Py_CLEAR(rep);
5820 }
5821
5822 /* Cut back to size actually needed. This is necessary for, for example,
5823 encoding of a string containing isolated surrogates and the 'ignore' handler
5824 is used. */
5825 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5826 if (nsize != PyBytes_GET_SIZE(v))
5827 _PyBytes_Resize(&v, nsize);
5828 Py_XDECREF(errorHandler);
5829 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00005830 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005831 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005832 error:
5833 Py_XDECREF(rep);
5834 Py_XDECREF(errorHandler);
5835 Py_XDECREF(exc);
5836 Py_XDECREF(v);
5837 return NULL;
5838#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005839}
5840
Alexander Belopolsky40018472011-02-26 01:02:56 +00005841PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005842PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5843 Py_ssize_t size,
5844 const char *errors,
5845 int byteorder)
5846{
5847 PyObject *result;
5848 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5849 if (tmp == NULL)
5850 return NULL;
5851 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5852 Py_DECREF(tmp);
5853 return result;
5854}
5855
5856PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005857PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005858{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005859 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005860}
5861
5862/* --- Unicode Escape Codec ----------------------------------------------- */
5863
/* Cached name-lookup C API used for \N{name} escapes.  Starts out NULL and
   is filled in by PyUnicode_DecodeUnicodeEscape() via PyCapsule_Import()
   the first time a \N escape is decoded. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00005864static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005865
Alexander Belopolsky40018472011-02-26 01:02:56 +00005866PyObject *
5867PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005868 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005869 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005870{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005871 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005872 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005873 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005874 PyObject *errorHandler = NULL;
5875 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005876
Victor Stinner62ec3312016-09-06 17:04:34 -07005877 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005878 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07005879 }
5880 /* Escaped strings will always be longer than the resulting
5881 Unicode string, so we start with size here and then reduce the
5882 length after conversion to the true value.
5883 (but if the error callback returns a long replacement string
5884 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005885 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07005886 writer.min_length = size;
5887 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
5888 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005889 }
5890
Guido van Rossumd57fd912000-03-10 22:53:23 +00005891 end = s + size;
5892 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07005893 unsigned char c = (unsigned char) *s++;
5894 Py_UCS4 ch;
5895 int count;
5896 Py_ssize_t startinpos;
5897 Py_ssize_t endinpos;
5898 const char *message;
5899
5900#define WRITE_ASCII_CHAR(ch) \
5901 do { \
5902 assert(ch <= 127); \
5903 assert(writer.pos < writer.size); \
5904 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
5905 } while(0)
5906
5907#define WRITE_CHAR(ch) \
5908 do { \
5909 if (ch <= writer.maxchar) { \
5910 assert(writer.pos < writer.size); \
5911 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
5912 } \
5913 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
5914 goto onError; \
5915 } \
5916 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005917
5918 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07005919 if (c != '\\') {
5920 WRITE_CHAR(c);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005921 continue;
5922 }
5923
Victor Stinner62ec3312016-09-06 17:04:34 -07005924 startinpos = s - starts - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005925 /* \ - Escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07005926 if (s >= end) {
5927 message = "\\ at end of string";
5928 goto error;
5929 }
5930 c = (unsigned char) *s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005931
Victor Stinner62ec3312016-09-06 17:04:34 -07005932 assert(writer.pos < writer.size);
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005933 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005934
Benjamin Peterson29060642009-01-31 22:14:21 +00005935 /* \x escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07005936 case '\n': continue;
5937 case '\\': WRITE_ASCII_CHAR('\\'); continue;
5938 case '\'': WRITE_ASCII_CHAR('\''); continue;
5939 case '\"': WRITE_ASCII_CHAR('\"'); continue;
5940 case 'b': WRITE_ASCII_CHAR('\b'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005941 /* FF */
Victor Stinner62ec3312016-09-06 17:04:34 -07005942 case 'f': WRITE_ASCII_CHAR('\014'); continue;
5943 case 't': WRITE_ASCII_CHAR('\t'); continue;
5944 case 'n': WRITE_ASCII_CHAR('\n'); continue;
5945 case 'r': WRITE_ASCII_CHAR('\r'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005946 /* VT */
Victor Stinner62ec3312016-09-06 17:04:34 -07005947 case 'v': WRITE_ASCII_CHAR('\013'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005948 /* BEL, not classic C */
Victor Stinner62ec3312016-09-06 17:04:34 -07005949 case 'a': WRITE_ASCII_CHAR('\007'); continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005950
Benjamin Peterson29060642009-01-31 22:14:21 +00005951 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005952 case '0': case '1': case '2': case '3':
5953 case '4': case '5': case '6': case '7':
Victor Stinner62ec3312016-09-06 17:04:34 -07005954 ch = c - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005955 if (s < end && '0' <= *s && *s <= '7') {
Victor Stinner62ec3312016-09-06 17:04:34 -07005956 ch = (ch<<3) + *s++ - '0';
5957 if (s < end && '0' <= *s && *s <= '7') {
5958 ch = (ch<<3) + *s++ - '0';
5959 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005960 }
Victor Stinner62ec3312016-09-06 17:04:34 -07005961 WRITE_CHAR(ch);
5962 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005963
Benjamin Peterson29060642009-01-31 22:14:21 +00005964 /* hex escapes */
5965 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005966 case 'x':
Victor Stinner62ec3312016-09-06 17:04:34 -07005967 count = 2;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005968 message = "truncated \\xXX escape";
5969 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005970
Benjamin Peterson29060642009-01-31 22:14:21 +00005971 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005972 case 'u':
Victor Stinner62ec3312016-09-06 17:04:34 -07005973 count = 4;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005974 message = "truncated \\uXXXX escape";
5975 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005976
Benjamin Peterson29060642009-01-31 22:14:21 +00005977 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005978 case 'U':
Victor Stinner62ec3312016-09-06 17:04:34 -07005979 count = 8;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005980 message = "truncated \\UXXXXXXXX escape";
5981 hexescape:
Victor Stinner62ec3312016-09-06 17:04:34 -07005982 for (ch = 0; count && s < end; ++s, --count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02005983 c = (unsigned char)*s;
Victor Stinner62ec3312016-09-06 17:04:34 -07005984 ch <<= 4;
5985 if (c >= '0' && c <= '9') {
5986 ch += c - '0';
5987 }
5988 else if (c >= 'a' && c <= 'f') {
5989 ch += c - ('a' - 10);
5990 }
5991 else if (c >= 'A' && c <= 'F') {
5992 ch += c - ('A' - 10);
5993 }
5994 else {
5995 break;
5996 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00005997 }
Victor Stinner62ec3312016-09-06 17:04:34 -07005998 if (count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02005999 goto error;
Victor Stinner62ec3312016-09-06 17:04:34 -07006000 }
6001
6002 /* when we get here, ch is a 32-bit unicode character */
6003 if (ch > MAX_UNICODE) {
6004 message = "illegal Unicode character";
6005 goto error;
6006 }
6007
6008 WRITE_CHAR(ch);
6009 continue;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006010
Benjamin Peterson29060642009-01-31 22:14:21 +00006011 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006012 case 'N':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006013 if (ucnhash_CAPI == NULL) {
6014 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006015 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6016 PyUnicodeData_CAPSULE_NAME, 1);
Victor Stinner62ec3312016-09-06 17:04:34 -07006017 if (ucnhash_CAPI == NULL) {
6018 PyErr_SetString(
6019 PyExc_UnicodeError,
6020 "\\N escapes not supported (can't load unicodedata module)"
6021 );
6022 goto onError;
6023 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006024 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006025
6026 message = "malformed \\N character escape";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006027 if (*s == '{') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006028 const char *start = ++s;
6029 size_t namelen;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006030 /* look for the closing brace */
Victor Stinner62ec3312016-09-06 17:04:34 -07006031 while (s < end && *s != '}')
Fredrik Lundhccc74732001-02-18 22:13:49 +00006032 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006033 namelen = s - start;
6034 if (namelen && s < end) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00006035 /* found a name. look it up in the unicode database */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006036 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006037 ch = 0xffffffff; /* in case 'getcode' messes up */
6038 if (namelen <= INT_MAX &&
6039 ucnhash_CAPI->getcode(NULL, start, (int)namelen,
6040 &ch, 0)) {
6041 assert(ch <= MAX_UNICODE);
6042 WRITE_CHAR(ch);
6043 continue;
6044 }
6045 message = "unknown Unicode character name";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006046 }
6047 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006048 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006049
6050 default:
R David Murray110b6fe2016-09-08 15:34:08 -04006051 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6052 "invalid escape sequence '\\%c'", c) < 0)
6053 goto onError;
Victor Stinner62ec3312016-09-06 17:04:34 -07006054 WRITE_ASCII_CHAR('\\');
6055 WRITE_CHAR(c);
6056 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006057 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006058
6059 error:
6060 endinpos = s-starts;
Victor Stinner62ec3312016-09-06 17:04:34 -07006061 writer.min_length = end - s + writer.pos;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006062 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02006063 errors, &errorHandler,
6064 "unicodeescape", message,
6065 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner62ec3312016-09-06 17:04:34 -07006066 &writer)) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006067 goto onError;
Victor Stinner62ec3312016-09-06 17:04:34 -07006068 }
6069 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0) {
6070 goto onError;
6071 }
6072
6073#undef WRITE_ASCII_CHAR
6074#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006075 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006076
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006077 Py_XDECREF(errorHandler);
6078 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006079 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00006080
Benjamin Peterson29060642009-01-31 22:14:21 +00006081 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006082 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006083 Py_XDECREF(errorHandler);
6084 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006085 return NULL;
6086}
6087
/* Return a Unicode-Escape encoded bytes version of the Unicode object.

   The result is the bare escaped text; it is not wrapped in quotes.

*/
6094
Alexander Belopolsky40018472011-02-26 01:02:56 +00006095PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006096PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006097{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006098 Py_ssize_t i, len;
Victor Stinner62ec3312016-09-06 17:04:34 -07006099 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006100 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006101 enum PyUnicode_Kind kind;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006102 void *data;
Victor Stinner62ec3312016-09-06 17:04:34 -07006103 Py_ssize_t expandsize;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006104
Ezio Melottie7f90372012-10-05 03:33:31 +03006105 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00006106 escape.
6107
Ezio Melottie7f90372012-10-05 03:33:31 +03006108 For UCS1 strings it's '\xxx', 4 bytes per source character.
6109 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6110 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00006111 */
6112
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006113 if (!PyUnicode_Check(unicode)) {
6114 PyErr_BadArgument();
6115 return NULL;
6116 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006117 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006118 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006119 }
Victor Stinner358af132015-10-12 22:36:57 +02006120
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006121 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006122 if (len == 0) {
6123 return PyBytes_FromStringAndSize(NULL, 0);
6124 }
6125
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006126 kind = PyUnicode_KIND(unicode);
6127 data = PyUnicode_DATA(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006128 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6129 bytes, and 1 byte characters 4. */
6130 expandsize = kind * 2 + 2;
6131 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize) {
6132 return PyErr_NoMemory();
6133 }
6134 repr = PyBytes_FromStringAndSize(NULL, 2 + expandsize * len + 1);
6135 if (repr == NULL) {
6136 return NULL;
6137 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006138
Victor Stinner62ec3312016-09-06 17:04:34 -07006139 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006140 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006141 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006142
Victor Stinner62ec3312016-09-06 17:04:34 -07006143 /* U+0000-U+00ff range */
6144 if (ch < 0x100) {
6145 if (ch >= ' ' && ch < 127) {
6146 if (ch != '\\') {
6147 /* Copy printable US ASCII as-is */
6148 *p++ = (char) ch;
6149 }
6150 /* Escape backslashes */
6151 else {
6152 *p++ = '\\';
6153 *p++ = '\\';
6154 }
6155 }
Victor Stinner358af132015-10-12 22:36:57 +02006156
Victor Stinner62ec3312016-09-06 17:04:34 -07006157 /* Map special whitespace to '\t', \n', '\r' */
6158 else if (ch == '\t') {
6159 *p++ = '\\';
6160 *p++ = 't';
6161 }
6162 else if (ch == '\n') {
6163 *p++ = '\\';
6164 *p++ = 'n';
6165 }
6166 else if (ch == '\r') {
6167 *p++ = '\\';
6168 *p++ = 'r';
6169 }
6170
6171 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6172 else {
6173 *p++ = '\\';
6174 *p++ = 'x';
6175 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6176 *p++ = Py_hexdigits[ch & 0x000F];
6177 }
Tim Petersced69f82003-09-16 20:30:58 +00006178 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006179 /* U+0000-U+00ff range: Map 16-bit characters to '\uHHHH' */
6180 else if (ch < 0x10000) {
6181 /* U+0100-U+ffff */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006182 *p++ = '\\';
6183 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006184 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6185 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6186 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6187 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006188 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006189 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6190 else {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006191
Victor Stinner62ec3312016-09-06 17:04:34 -07006192 /* Make sure that the first two digits are zero */
6193 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006194 *p++ = '\\';
Victor Stinner62ec3312016-09-06 17:04:34 -07006195 *p++ = 'U';
6196 *p++ = '0';
6197 *p++ = '0';
6198 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6199 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6200 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6201 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6202 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6203 *p++ = Py_hexdigits[ch & 0x0000000F];
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006204 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006205 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006206
Victor Stinner62ec3312016-09-06 17:04:34 -07006207 assert(p - PyBytes_AS_STRING(repr) > 0);
6208 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6209 return NULL;
6210 }
6211 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006212}
6213
Alexander Belopolsky40018472011-02-26 01:02:56 +00006214PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006215PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6216 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006217{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006218 PyObject *result;
6219 PyObject *tmp = PyUnicode_FromUnicode(s, size);
Victor Stinner62ec3312016-09-06 17:04:34 -07006220 if (tmp == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006221 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006222 }
6223
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006224 result = PyUnicode_AsUnicodeEscapeString(tmp);
6225 Py_DECREF(tmp);
6226 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006227}
6228
6229/* --- Raw Unicode Escape Codec ------------------------------------------- */
6230
Alexander Belopolsky40018472011-02-26 01:02:56 +00006231PyObject *
6232PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006233 Py_ssize_t size,
6234 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006235{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006236 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006237 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006238 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006239 PyObject *errorHandler = NULL;
6240 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006241
Victor Stinner62ec3312016-09-06 17:04:34 -07006242 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006243 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006244 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006245
Guido van Rossumd57fd912000-03-10 22:53:23 +00006246 /* Escaped strings will always be longer than the resulting
6247 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006248 length after conversion to the true value. (But decoding error
6249 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006250 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07006251 writer.min_length = size;
6252 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6253 goto onError;
6254 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006255
Guido van Rossumd57fd912000-03-10 22:53:23 +00006256 end = s + size;
6257 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006258 unsigned char c = (unsigned char) *s++;
6259 Py_UCS4 ch;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006260 int count;
Victor Stinner62ec3312016-09-06 17:04:34 -07006261 Py_ssize_t startinpos;
6262 Py_ssize_t endinpos;
6263 const char *message;
6264
6265#define WRITE_CHAR(ch) \
6266 do { \
6267 if (ch <= writer.maxchar) { \
6268 assert(writer.pos < writer.size); \
6269 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6270 } \
6271 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6272 goto onError; \
6273 } \
6274 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006275
Benjamin Peterson29060642009-01-31 22:14:21 +00006276 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006277 if (c != '\\' || s >= end) {
6278 WRITE_CHAR(c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006279 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006280 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006281
Victor Stinner62ec3312016-09-06 17:04:34 -07006282 c = (unsigned char) *s++;
6283 if (c == 'u') {
6284 count = 4;
6285 message = "truncated \\uXXXX escape";
Benjamin Peterson29060642009-01-31 22:14:21 +00006286 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006287 else if (c == 'U') {
6288 count = 8;
6289 message = "truncated \\UXXXXXXXX escape";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006290 }
6291 else {
Victor Stinner62ec3312016-09-06 17:04:34 -07006292 assert(writer.pos < writer.size);
6293 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6294 WRITE_CHAR(c);
6295 continue;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006296 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006297 startinpos = s - starts - 2;
6298
6299 /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6300 for (ch = 0; count && s < end; ++s, --count) {
6301 c = (unsigned char)*s;
6302 ch <<= 4;
6303 if (c >= '0' && c <= '9') {
6304 ch += c - '0';
6305 }
6306 else if (c >= 'a' && c <= 'f') {
6307 ch += c - ('a' - 10);
6308 }
6309 else if (c >= 'A' && c <= 'F') {
6310 ch += c - ('A' - 10);
6311 }
6312 else {
6313 break;
6314 }
6315 }
6316 if (!count) {
6317 if (ch <= MAX_UNICODE) {
6318 WRITE_CHAR(ch);
6319 continue;
6320 }
6321 message = "\\Uxxxxxxxx out of range";
6322 }
6323
6324 endinpos = s-starts;
6325 writer.min_length = end - s + writer.pos;
6326 if (unicode_decode_call_errorhandler_writer(
6327 errors, &errorHandler,
6328 "rawunicodeescape", message,
6329 &starts, &end, &startinpos, &endinpos, &exc, &s,
6330 &writer)) {
6331 goto onError;
6332 }
6333 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0) {
6334 goto onError;
6335 }
6336
6337#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006338 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006339 Py_XDECREF(errorHandler);
6340 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006341 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006342
Benjamin Peterson29060642009-01-31 22:14:21 +00006343 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006344 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006345 Py_XDECREF(errorHandler);
6346 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006347 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006348
Guido van Rossumd57fd912000-03-10 22:53:23 +00006349}
6350
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006351
Alexander Belopolsky40018472011-02-26 01:02:56 +00006352PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006353PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006354{
Victor Stinner62ec3312016-09-06 17:04:34 -07006355 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006356 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006357 Py_ssize_t expandsize, pos;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006358 int kind;
6359 void *data;
6360 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006361
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006362 if (!PyUnicode_Check(unicode)) {
6363 PyErr_BadArgument();
6364 return NULL;
6365 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006366 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006367 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006368 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006369 kind = PyUnicode_KIND(unicode);
6370 data = PyUnicode_DATA(unicode);
6371 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006372 if (kind == PyUnicode_1BYTE_KIND) {
6373 return PyBytes_FromStringAndSize(data, len);
6374 }
Victor Stinner0e368262011-11-10 20:12:49 +01006375
Victor Stinner62ec3312016-09-06 17:04:34 -07006376 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6377 bytes, and 1 byte characters 4. */
6378 expandsize = kind * 2 + 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006379
Victor Stinner62ec3312016-09-06 17:04:34 -07006380 if (len > PY_SSIZE_T_MAX / expandsize) {
6381 return PyErr_NoMemory();
6382 }
6383 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6384 if (repr == NULL) {
6385 return NULL;
6386 }
6387 if (len == 0) {
6388 return repr;
6389 }
6390
6391 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006392 for (pos = 0; pos < len; pos++) {
6393 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Victor Stinner358af132015-10-12 22:36:57 +02006394
Victor Stinner62ec3312016-09-06 17:04:34 -07006395 /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6396 if (ch < 0x100) {
6397 *p++ = (char) ch;
Tim Petersced69f82003-09-16 20:30:58 +00006398 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006399 /* U+0000-U+00ff range: Map 16-bit characters to '\uHHHH' */
6400 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006401 *p++ = '\\';
6402 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006403 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6404 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6405 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6406 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006407 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006408 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6409 else {
6410 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6411 *p++ = '\\';
6412 *p++ = 'U';
6413 *p++ = '0';
6414 *p++ = '0';
6415 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6416 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6417 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6418 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6419 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6420 *p++ = Py_hexdigits[ch & 15];
6421 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006422 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006423
Victor Stinner62ec3312016-09-06 17:04:34 -07006424 assert(p > PyBytes_AS_STRING(repr));
6425 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6426 return NULL;
6427 }
6428 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006429}
6430
Alexander Belopolsky40018472011-02-26 01:02:56 +00006431PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006432PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6433 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006434{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006435 PyObject *result;
6436 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6437 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006438 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006439 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6440 Py_DECREF(tmp);
6441 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006442}
6443
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006444/* --- Unicode Internal Codec ------------------------------------------- */
6445
Alexander Belopolsky40018472011-02-26 01:02:56 +00006446PyObject *
6447_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006448 Py_ssize_t size,
6449 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006450{
6451 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006452 Py_ssize_t startinpos;
6453 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006454 _PyUnicodeWriter writer;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006455 const char *end;
6456 const char *reason;
6457 PyObject *errorHandler = NULL;
6458 PyObject *exc = NULL;
6459
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006460 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006461 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006462 1))
6463 return NULL;
6464
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006465 if (size == 0)
6466 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006467
Victor Stinner8f674cc2013-04-17 23:02:17 +02006468 _PyUnicodeWriter_Init(&writer);
6469 if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) {
6470 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00006471 goto onError;
Victor Stinner8f674cc2013-04-17 23:02:17 +02006472 }
6473 writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006474
Victor Stinner8f674cc2013-04-17 23:02:17 +02006475 end = s + size;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006476 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006477 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006478 Py_UCS4 ch;
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006479 if (end - s < Py_UNICODE_SIZE) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006480 endinpos = end-starts;
6481 reason = "truncated input";
6482 goto error;
6483 }
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006484 /* We copy the raw representation one byte at a time because the
6485 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006486 ((char *) &uch)[0] = s[0];
6487 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006488#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006489 ((char *) &uch)[2] = s[2];
6490 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006491#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006492 ch = uch;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006493#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006494 /* We have to sanity check the raw data, otherwise doom looms for
6495 some malformed UCS-4 data. */
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006496 if (ch > 0x10ffff) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006497 endinpos = s - starts + Py_UNICODE_SIZE;
6498 reason = "illegal code point (> 0x10FFFF)";
6499 goto error;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006500 }
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006501#endif
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006502 s += Py_UNICODE_SIZE;
6503#ifndef Py_UNICODE_WIDE
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006504 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006505 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006506 Py_UNICODE uch2;
6507 ((char *) &uch2)[0] = s[0];
6508 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006509 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006510 {
Victor Stinner551ac952011-11-29 22:58:13 +01006511 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006512 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006513 }
6514 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006515#endif
6516
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006517 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006518 goto onError;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006519 continue;
6520
6521 error:
6522 startinpos = s - starts;
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006523 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006524 errors, &errorHandler,
6525 "unicode_internal", reason,
6526 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006527 &writer))
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006528 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006529 }
6530
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006531 Py_XDECREF(errorHandler);
6532 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006533 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006534
Benjamin Peterson29060642009-01-31 22:14:21 +00006535 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006536 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006537 Py_XDECREF(errorHandler);
6538 Py_XDECREF(exc);
6539 return NULL;
6540}
6541
Guido van Rossumd57fd912000-03-10 22:53:23 +00006542/* --- Latin-1 Codec ------------------------------------------------------ */
6543
Alexander Belopolsky40018472011-02-26 01:02:56 +00006544PyObject *
6545PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006546 Py_ssize_t size,
6547 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006548{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006549 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006550 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006551}
6552
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006553/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006554static void
6555make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006556 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006557 PyObject *unicode,
6558 Py_ssize_t startpos, Py_ssize_t endpos,
6559 const char *reason)
6560{
6561 if (*exceptionObject == NULL) {
6562 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006563 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006564 encoding, unicode, startpos, endpos, reason);
6565 }
6566 else {
6567 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6568 goto onError;
6569 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6570 goto onError;
6571 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6572 goto onError;
6573 return;
6574 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006575 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006576 }
6577}
6578
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006579/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006580static void
6581raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006582 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006583 PyObject *unicode,
6584 Py_ssize_t startpos, Py_ssize_t endpos,
6585 const char *reason)
6586{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006587 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006588 encoding, unicode, startpos, endpos, reason);
6589 if (*exceptionObject != NULL)
6590 PyCodec_StrictErrors(*exceptionObject);
6591}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006592
6593/* error handling callback helper:
6594 build arguments, call the callback and check the arguments,
6595 put the result into newpos and return the replacement string, which
6596 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006597static PyObject *
6598unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006599 PyObject **errorHandler,
6600 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006601 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006602 Py_ssize_t startpos, Py_ssize_t endpos,
6603 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006604{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02006605 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006606 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006607 PyObject *restuple;
6608 PyObject *resunicode;
6609
6610 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006611 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006612 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006613 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006614 }
6615
Benjamin Petersonbac79492012-01-14 13:34:47 -05006616 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006617 return NULL;
6618 len = PyUnicode_GET_LENGTH(unicode);
6619
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006620 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006621 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006622 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006623 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006624
6625 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006626 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006627 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006628 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006629 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006630 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006631 Py_DECREF(restuple);
6632 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006633 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006634 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006635 &resunicode, newpos)) {
6636 Py_DECREF(restuple);
6637 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006638 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006639 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6640 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6641 Py_DECREF(restuple);
6642 return NULL;
6643 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006644 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006645 *newpos = len + *newpos;
6646 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02006647 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006648 Py_DECREF(restuple);
6649 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006650 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006651 Py_INCREF(resunicode);
6652 Py_DECREF(restuple);
6653 return resunicode;
6654}
6655
Alexander Belopolsky40018472011-02-26 01:02:56 +00006656static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006657unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006658 const char *errors,
Victor Stinner0030cd52015-09-24 14:45:00 +02006659 const Py_UCS4 limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006660{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006661 /* input state */
6662 Py_ssize_t pos=0, size;
6663 int kind;
6664 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006665 /* pointer into the output */
6666 char *str;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006667 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6668 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Victor Stinner50149202015-09-22 00:26:54 +02006669 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006670 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02006671 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006672 PyObject *rep = NULL;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006673 /* output object */
6674 _PyBytesWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006675
Benjamin Petersonbac79492012-01-14 13:34:47 -05006676 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006677 return NULL;
6678 size = PyUnicode_GET_LENGTH(unicode);
6679 kind = PyUnicode_KIND(unicode);
6680 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006681 /* allocate enough for a simple encoding without
6682 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006683 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006684 return PyBytes_FromStringAndSize(NULL, 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006685
6686 _PyBytesWriter_Init(&writer);
6687 str = _PyBytesWriter_Alloc(&writer, size);
6688 if (str == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006689 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006690
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006691 while (pos < size) {
Victor Stinner0030cd52015-09-24 14:45:00 +02006692 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006693
Benjamin Peterson29060642009-01-31 22:14:21 +00006694 /* can we encode this? */
Victor Stinner0030cd52015-09-24 14:45:00 +02006695 if (ch < limit) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006696 /* no overflow check, because we know that the space is enough */
Victor Stinner0030cd52015-09-24 14:45:00 +02006697 *str++ = (char)ch;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006698 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006699 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006700 else {
Victor Stinner6bd525b2015-10-09 13:10:05 +02006701 Py_ssize_t newpos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006702 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006703 Py_ssize_t collstart = pos;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006704 Py_ssize_t collend = collstart + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006705 /* find all unecodable characters */
Victor Stinner50149202015-09-22 00:26:54 +02006706
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006707 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006708 ++collend;
Victor Stinner50149202015-09-22 00:26:54 +02006709
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006710 /* Only overallocate the buffer if it's not the last write */
6711 writer.overallocate = (collend < size);
6712
Benjamin Peterson29060642009-01-31 22:14:21 +00006713 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02006714 if (error_handler == _Py_ERROR_UNKNOWN)
6715 error_handler = get_error_handler(errors);
6716
6717 switch (error_handler) {
6718 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006719 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006720 goto onError;
Victor Stinner50149202015-09-22 00:26:54 +02006721
6722 case _Py_ERROR_REPLACE:
Victor Stinner01ada392015-10-01 21:54:51 +02006723 memset(str, '?', collend - collstart);
6724 str += (collend - collstart);
Victor Stinner0030cd52015-09-24 14:45:00 +02006725 /* fall through ignore error handler */
Victor Stinner50149202015-09-22 00:26:54 +02006726 case _Py_ERROR_IGNORE:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006727 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006728 break;
Victor Stinner50149202015-09-22 00:26:54 +02006729
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006730 case _Py_ERROR_BACKSLASHREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006731 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006732 writer.min_size -= (collend - collstart);
6733 str = backslashreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006734 unicode, collstart, collend);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006735 if (str == NULL)
6736 goto onError;
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006737 pos = collend;
6738 break;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006739
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006740 case _Py_ERROR_XMLCHARREFREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006741 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006742 writer.min_size -= (collend - collstart);
6743 str = xmlcharrefreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006744 unicode, collstart, collend);
6745 if (str == NULL)
6746 goto onError;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006747 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006748 break;
Victor Stinner50149202015-09-22 00:26:54 +02006749
Victor Stinnerc3713e92015-09-29 12:32:13 +02006750 case _Py_ERROR_SURROGATEESCAPE:
6751 for (i = collstart; i < collend; ++i) {
6752 ch = PyUnicode_READ(kind, data, i);
6753 if (ch < 0xdc80 || 0xdcff < ch) {
6754 /* Not a UTF-8b surrogate */
6755 break;
6756 }
6757 *str++ = (char)(ch - 0xdc00);
6758 ++pos;
6759 }
6760 if (i >= collend)
6761 break;
6762 collstart = pos;
6763 assert(collstart != collend);
6764 /* fallback to general error handling */
6765
Benjamin Peterson29060642009-01-31 22:14:21 +00006766 default:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006767 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
6768 encoding, reason, unicode, &exc,
6769 collstart, collend, &newpos);
6770 if (rep == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006771 goto onError;
Victor Stinner0030cd52015-09-24 14:45:00 +02006772
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006773 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006774 writer.min_size -= 1;
6775
Victor Stinner6bd525b2015-10-09 13:10:05 +02006776 if (PyBytes_Check(rep)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006777 /* Directly copy bytes result to output. */
Victor Stinnerce179bf2015-10-09 12:57:22 +02006778 str = _PyBytesWriter_WriteBytes(&writer, str,
Victor Stinner6bd525b2015-10-09 13:10:05 +02006779 PyBytes_AS_STRING(rep),
6780 PyBytes_GET_SIZE(rep));
Victor Stinnerad771582015-10-09 12:38:53 +02006781 if (str == NULL)
6782 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006783 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02006784 else {
6785 assert(PyUnicode_Check(rep));
Victor Stinner0030cd52015-09-24 14:45:00 +02006786
Victor Stinner6bd525b2015-10-09 13:10:05 +02006787 if (PyUnicode_READY(rep) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006788 goto onError;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006789
6790 if (PyUnicode_IS_ASCII(rep)) {
6791 /* Fast path: all characters are smaller than limit */
6792 assert(limit >= 128);
6793 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6794 str = _PyBytesWriter_WriteBytes(&writer, str,
6795 PyUnicode_DATA(rep),
6796 PyUnicode_GET_LENGTH(rep));
Benjamin Peterson29060642009-01-31 22:14:21 +00006797 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02006798 else {
6799 Py_ssize_t repsize = PyUnicode_GET_LENGTH(rep);
6800
6801 str = _PyBytesWriter_Prepare(&writer, str, repsize);
6802 if (str == NULL)
6803 goto onError;
6804
6805 /* check if there is anything unencodable in the
6806 replacement and copy it to the output */
6807 for (i = 0; repsize-->0; ++i, ++str) {
6808 ch = PyUnicode_READ_CHAR(rep, i);
6809 if (ch >= limit) {
6810 raise_encode_exception(&exc, encoding, unicode,
6811 pos, pos+1, reason);
6812 goto onError;
6813 }
6814 *str = (char)ch;
6815 }
6816 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006817 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006818 pos = newpos;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006819 Py_CLEAR(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006820 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006821
6822 /* If overallocation was disabled, ensure that it was the last
6823 write. Otherwise, we missed an optimization */
6824 assert(writer.overallocate || pos == size);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006825 }
6826 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006827
Victor Stinner50149202015-09-22 00:26:54 +02006828 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006829 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006830 return _PyBytesWriter_Finish(&writer, str);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006831
6832 onError:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006833 Py_XDECREF(rep);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006834 _PyBytesWriter_Dealloc(&writer);
Victor Stinner50149202015-09-22 00:26:54 +02006835 Py_XDECREF(error_handler_obj);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006836 Py_XDECREF(exc);
6837 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006838}
6839
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006840/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006841PyObject *
6842PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006843 Py_ssize_t size,
6844 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006845{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006846 PyObject *result;
6847 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6848 if (unicode == NULL)
6849 return NULL;
6850 result = unicode_encode_ucs1(unicode, errors, 256);
6851 Py_DECREF(unicode);
6852 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006853}
6854
Alexander Belopolsky40018472011-02-26 01:02:56 +00006855PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006856_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006857{
6858 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006859 PyErr_BadArgument();
6860 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006861 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006862 if (PyUnicode_READY(unicode) == -1)
6863 return NULL;
6864 /* Fast path: if it is a one-byte string, construct
6865 bytes object directly. */
6866 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6867 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6868 PyUnicode_GET_LENGTH(unicode));
6869 /* Non-Latin-1 characters present. Defer to above function to
6870 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006871 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006872}
6873
6874PyObject*
6875PyUnicode_AsLatin1String(PyObject *unicode)
6876{
6877 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006878}
6879
6880/* --- 7-bit ASCII Codec -------------------------------------------------- */
6881
Alexander Belopolsky40018472011-02-26 01:02:56 +00006882PyObject *
6883PyUnicode_DecodeASCII(const char *s,
6884 Py_ssize_t size,
6885 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006886{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006887 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006888 _PyUnicodeWriter writer;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006889 int kind;
6890 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006891 Py_ssize_t startinpos;
6892 Py_ssize_t endinpos;
6893 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006894 const char *e;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006895 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006896 PyObject *exc = NULL;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006897 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Tim Petersced69f82003-09-16 20:30:58 +00006898
Guido van Rossumd57fd912000-03-10 22:53:23 +00006899 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02006900 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006901
Guido van Rossumd57fd912000-03-10 22:53:23 +00006902 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006903 if (size == 1 && (unsigned char)s[0] < 128)
6904 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006905
Victor Stinner8f674cc2013-04-17 23:02:17 +02006906 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02006907 writer.min_length = size;
6908 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
Victor Stinner8f674cc2013-04-17 23:02:17 +02006909 return NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006910
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006911 e = s + size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006912 data = writer.data;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006913 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006914 writer.pos = outpos;
6915 if (writer.pos == size)
6916 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006917
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006918 s += writer.pos;
6919 kind = writer.kind;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006920 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02006921 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00006922 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006923 PyUnicode_WRITE(kind, data, writer.pos, c);
6924 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00006925 ++s;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006926 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +00006927 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02006928
6929 /* byte outsize range 0x00..0x7f: call the error handler */
6930
6931 if (error_handler == _Py_ERROR_UNKNOWN)
6932 error_handler = get_error_handler(errors);
6933
6934 switch (error_handler)
6935 {
6936 case _Py_ERROR_REPLACE:
6937 case _Py_ERROR_SURROGATEESCAPE:
6938 /* Fast-path: the error handler only writes one character,
Victor Stinnerca9381e2015-09-22 00:58:32 +02006939 but we may switch to UCS2 at the first write */
6940 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
6941 goto onError;
6942 kind = writer.kind;
6943 data = writer.data;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006944
6945 if (error_handler == _Py_ERROR_REPLACE)
6946 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
6947 else
6948 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
6949 writer.pos++;
6950 ++s;
6951 break;
6952
6953 case _Py_ERROR_IGNORE:
6954 ++s;
6955 break;
6956
6957 default:
Benjamin Peterson29060642009-01-31 22:14:21 +00006958 startinpos = s-starts;
6959 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006960 if (unicode_decode_call_errorhandler_writer(
Victor Stinnerf96418d2015-09-21 23:06:27 +02006961 errors, &error_handler_obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00006962 "ascii", "ordinal not in range(128)",
6963 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006964 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006965 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006966 kind = writer.kind;
6967 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00006968 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006969 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02006970 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006971 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006972 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006973
Benjamin Peterson29060642009-01-31 22:14:21 +00006974 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006975 _PyUnicodeWriter_Dealloc(&writer);
Victor Stinnerf96418d2015-09-21 23:06:27 +02006976 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006977 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006978 return NULL;
6979}
6980
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006981/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006982PyObject *
6983PyUnicode_EncodeASCII(const Py_UNICODE *p,
6984 Py_ssize_t size,
6985 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006986{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006987 PyObject *result;
6988 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6989 if (unicode == NULL)
6990 return NULL;
6991 result = unicode_encode_ucs1(unicode, errors, 128);
6992 Py_DECREF(unicode);
6993 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006994}
6995
Alexander Belopolsky40018472011-02-26 01:02:56 +00006996PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006997_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006998{
6999 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007000 PyErr_BadArgument();
7001 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007002 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007003 if (PyUnicode_READY(unicode) == -1)
7004 return NULL;
7005 /* Fast path: if it is an ASCII-only string, construct bytes object
7006 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02007007 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007008 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7009 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007010 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007011}
7012
7013PyObject *
7014PyUnicode_AsASCIIString(PyObject *unicode)
7015{
7016 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007017}
7018
Steve Dowercc16be82016-09-08 10:35:16 -07007019#ifdef MS_WINDOWS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007020
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007021/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007022
/* NEED_RETRY is defined when SIZEOF_INT is smaller than SIZEOF_SIZE_T;
   code guarded by it splits inputs larger than INT_MAX into chunks. */
#if SIZEOF_SIZE_T > SIZEOF_INT
#define NEED_RETRY
#endif

/* Provide the flag value when the included headers do not define it. */
#if !defined(WC_ERR_INVALID_CHARS)
#  define WC_ERR_INVALID_CHARS 0x0080
#endif

7030
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007031static const char*
Victor Stinner3a50e702011-10-18 21:21:00 +02007032code_page_name(UINT code_page, PyObject **obj)
7033{
7034 *obj = NULL;
7035 if (code_page == CP_ACP)
7036 return "mbcs";
7037 if (code_page == CP_UTF7)
7038 return "CP_UTF7";
7039 if (code_page == CP_UTF8)
7040 return "CP_UTF8";
7041
7042 *obj = PyBytes_FromFormat("cp%u", code_page);
7043 if (*obj == NULL)
7044 return NULL;
7045 return PyBytes_AS_STRING(*obj);
7046}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007047
Victor Stinner3a50e702011-10-18 21:21:00 +02007048static DWORD
7049decode_code_page_flags(UINT code_page)
7050{
7051 if (code_page == CP_UTF7) {
7052 /* The CP_UTF7 decoder only supports flags=0 */
7053 return 0;
7054 }
7055 else
7056 return MB_ERR_INVALID_CHARS;
7057}
7058
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007059/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007060 * Decode a byte string from a Windows code page into unicode object in strict
7061 * mode.
7062 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007063 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7064 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007065 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007066static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007067decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007068 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02007069 const char *in,
7070 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007071{
Victor Stinner3a50e702011-10-18 21:21:00 +02007072 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01007073 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007074 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007075
7076 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007077 assert(insize > 0);
7078 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
7079 if (outsize <= 0)
7080 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007081
7082 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007083 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01007084 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007085 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00007086 if (*v == NULL)
7087 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007088 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007089 }
7090 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007091 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007092 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01007093 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007094 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007095 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007096 }
7097
7098 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007099 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7100 if (outsize <= 0)
7101 goto error;
7102 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007103
Victor Stinner3a50e702011-10-18 21:21:00 +02007104error:
7105 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7106 return -2;
7107 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007108 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007109}
7110
Victor Stinner3a50e702011-10-18 21:21:00 +02007111/*
7112 * Decode a byte string from a code page into unicode object with an error
7113 * handler.
7114 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007115 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02007116 * UnicodeDecodeError exception and returns -1 on error.
7117 */
7118static int
7119decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007120 PyObject **v,
7121 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007122 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02007123{
7124 const char *startin = in;
7125 const char *endin = in + size;
7126 const DWORD flags = decode_code_page_flags(code_page);
7127 /* Ideally, we should get reason from FormatMessage. This is the Windows
7128 2000 English version of the message. */
7129 const char *reason = "No mapping for the Unicode character exists "
7130 "in the target code page.";
7131 /* each step cannot decode more than 1 character, but a character can be
7132 represented as a surrogate pair */
7133 wchar_t buffer[2], *startout, *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02007134 int insize;
7135 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007136 PyObject *errorHandler = NULL;
7137 PyObject *exc = NULL;
7138 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007139 const char *encoding;
Victor Stinner3a50e702011-10-18 21:21:00 +02007140 DWORD err;
7141 int ret = -1;
7142
7143 assert(size > 0);
7144
7145 encoding = code_page_name(code_page, &encoding_obj);
7146 if (encoding == NULL)
7147 return -1;
7148
Victor Stinner7d00cc12014-03-17 23:08:06 +01007149 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007150 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7151 UnicodeDecodeError. */
7152 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7153 if (exc != NULL) {
7154 PyCodec_StrictErrors(exc);
7155 Py_CLEAR(exc);
7156 }
7157 goto error;
7158 }
7159
7160 if (*v == NULL) {
7161 /* Create unicode object */
7162 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7163 PyErr_NoMemory();
7164 goto error;
7165 }
Victor Stinnerab595942011-12-17 04:59:06 +01007166 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007167 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02007168 if (*v == NULL)
7169 goto error;
7170 startout = PyUnicode_AS_UNICODE(*v);
7171 }
7172 else {
7173 /* Extend unicode object */
7174 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7175 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7176 PyErr_NoMemory();
7177 goto error;
7178 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007179 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007180 goto error;
7181 startout = PyUnicode_AS_UNICODE(*v) + n;
7182 }
7183
7184 /* Decode the byte string character per character */
7185 out = startout;
7186 while (in < endin)
7187 {
7188 /* Decode a character */
7189 insize = 1;
7190 do
7191 {
7192 outsize = MultiByteToWideChar(code_page, flags,
7193 in, insize,
7194 buffer, Py_ARRAY_LENGTH(buffer));
7195 if (outsize > 0)
7196 break;
7197 err = GetLastError();
7198 if (err != ERROR_NO_UNICODE_TRANSLATION
7199 && err != ERROR_INSUFFICIENT_BUFFER)
7200 {
7201 PyErr_SetFromWindowsErr(0);
7202 goto error;
7203 }
7204 insize++;
7205 }
7206 /* 4=maximum length of a UTF-8 sequence */
7207 while (insize <= 4 && (in + insize) <= endin);
7208
7209 if (outsize <= 0) {
7210 Py_ssize_t startinpos, endinpos, outpos;
7211
Victor Stinner7d00cc12014-03-17 23:08:06 +01007212 /* last character in partial decode? */
7213 if (in + insize >= endin && !final)
7214 break;
7215
Victor Stinner3a50e702011-10-18 21:21:00 +02007216 startinpos = in - startin;
7217 endinpos = startinpos + 1;
7218 outpos = out - PyUnicode_AS_UNICODE(*v);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007219 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007220 errors, &errorHandler,
7221 encoding, reason,
7222 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01007223 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007224 {
7225 goto error;
7226 }
Victor Stinner596a6c42011-11-09 00:02:18 +01007227 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007228 }
7229 else {
7230 in += insize;
7231 memcpy(out, buffer, outsize * sizeof(wchar_t));
7232 out += outsize;
7233 }
7234 }
7235
7236 /* write a NUL character at the end */
7237 *out = 0;
7238
7239 /* Extend unicode object */
7240 outsize = out - startout;
7241 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01007242 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007243 goto error;
Victor Stinnere1f17c62014-07-25 14:03:03 +02007244 /* (in - startin) <= size and size is an int */
7245 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02007246
7247error:
7248 Py_XDECREF(encoding_obj);
7249 Py_XDECREF(errorHandler);
7250 Py_XDECREF(exc);
7251 return ret;
7252}
7253
Victor Stinner3a50e702011-10-18 21:21:00 +02007254static PyObject *
7255decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007256 const char *s, Py_ssize_t size,
7257 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007258{
Victor Stinner76a31a62011-11-04 00:05:13 +01007259 PyObject *v = NULL;
7260 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007261
Victor Stinner3a50e702011-10-18 21:21:00 +02007262 if (code_page < 0) {
7263 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7264 return NULL;
7265 }
7266
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007267 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007268 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007269
Victor Stinner76a31a62011-11-04 00:05:13 +01007270 do
7271 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007272#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007273 if (size > INT_MAX) {
7274 chunk_size = INT_MAX;
7275 final = 0;
7276 done = 0;
7277 }
7278 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007279#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007280 {
7281 chunk_size = (int)size;
7282 final = (consumed == NULL);
7283 done = 1;
7284 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007285
Victor Stinner76a31a62011-11-04 00:05:13 +01007286 if (chunk_size == 0 && done) {
7287 if (v != NULL)
7288 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007289 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007290 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007291
Victor Stinner76a31a62011-11-04 00:05:13 +01007292 converted = decode_code_page_strict(code_page, &v,
7293 s, chunk_size);
7294 if (converted == -2)
7295 converted = decode_code_page_errors(code_page, &v,
7296 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007297 errors, final);
7298 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007299
7300 if (converted < 0) {
7301 Py_XDECREF(v);
7302 return NULL;
7303 }
7304
7305 if (consumed)
7306 *consumed += converted;
7307
7308 s += converted;
7309 size -= converted;
7310 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007311
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007312 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007313}
7314
Alexander Belopolsky40018472011-02-26 01:02:56 +00007315PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007316PyUnicode_DecodeCodePageStateful(int code_page,
7317 const char *s,
7318 Py_ssize_t size,
7319 const char *errors,
7320 Py_ssize_t *consumed)
7321{
7322 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7323}
7324
7325PyObject *
7326PyUnicode_DecodeMBCSStateful(const char *s,
7327 Py_ssize_t size,
7328 const char *errors,
7329 Py_ssize_t *consumed)
7330{
7331 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7332}
7333
7334PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007335PyUnicode_DecodeMBCS(const char *s,
7336 Py_ssize_t size,
7337 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007338{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007339 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7340}
7341
Victor Stinner3a50e702011-10-18 21:21:00 +02007342static DWORD
7343encode_code_page_flags(UINT code_page, const char *errors)
7344{
7345 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007346 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007347 }
7348 else if (code_page == CP_UTF7) {
7349 /* CP_UTF7 only supports flags=0 */
7350 return 0;
7351 }
7352 else {
7353 if (errors != NULL && strcmp(errors, "replace") == 0)
7354 return 0;
7355 else
7356 return WC_NO_BEST_FIT_CHARS;
7357 }
7358}
7359
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007360/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007361 * Encode a Unicode string to a Windows code page into a byte string in strict
7362 * mode.
7363 *
7364 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007365 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007366 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007367static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007368encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007369 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007370 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007371{
Victor Stinner554f3f02010-06-16 23:33:54 +00007372 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007373 BOOL *pusedDefaultChar = &usedDefaultChar;
7374 int outsize;
Victor Stinner24729f32011-11-10 20:31:37 +01007375 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007376 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007377 const DWORD flags = encode_code_page_flags(code_page, NULL);
7378 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007379 /* Create a substring so that we can get the UTF-16 representation
7380 of just the slice under consideration. */
7381 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007382
Martin v. Löwis3d325192011-11-04 18:23:06 +01007383 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007384
Victor Stinner3a50e702011-10-18 21:21:00 +02007385 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007386 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007387 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007388 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007389
Victor Stinner2fc507f2011-11-04 20:06:39 +01007390 substring = PyUnicode_Substring(unicode, offset, offset+len);
7391 if (substring == NULL)
7392 return -1;
7393 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7394 if (p == NULL) {
7395 Py_DECREF(substring);
7396 return -1;
7397 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007398 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007399
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007400 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007401 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007402 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007403 NULL, 0,
7404 NULL, pusedDefaultChar);
7405 if (outsize <= 0)
7406 goto error;
7407 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007408 if (pusedDefaultChar && *pusedDefaultChar) {
7409 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007410 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007411 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007412
Victor Stinner3a50e702011-10-18 21:21:00 +02007413 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007414 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007415 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007416 if (*outbytes == NULL) {
7417 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007418 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007419 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007420 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007421 }
7422 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007423 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007424 const Py_ssize_t n = PyBytes_Size(*outbytes);
7425 if (outsize > PY_SSIZE_T_MAX - n) {
7426 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007427 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007428 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007429 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007430 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7431 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007432 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007433 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007434 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007435 }
7436
7437 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007438 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007439 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007440 out, outsize,
7441 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007442 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007443 if (outsize <= 0)
7444 goto error;
7445 if (pusedDefaultChar && *pusedDefaultChar)
7446 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007447 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007448
Victor Stinner3a50e702011-10-18 21:21:00 +02007449error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007450 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007451 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7452 return -2;
7453 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007454 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007455}
7456
Victor Stinner3a50e702011-10-18 21:21:00 +02007457/*
Serhiy Storchakad65c9492015-11-02 14:10:23 +02007458 * Encode a Unicode string to a Windows code page into a byte string using an
Victor Stinner3a50e702011-10-18 21:21:00 +02007459 * error handler.
7460 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007461 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007462 * -1 on other error.
7463 */
7464static int
7465encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007466 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007467 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007468{
Victor Stinner3a50e702011-10-18 21:21:00 +02007469 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007470 Py_ssize_t pos = unicode_offset;
7471 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007472 /* Ideally, we should get reason from FormatMessage. This is the Windows
7473 2000 English version of the message. */
7474 const char *reason = "invalid character";
7475 /* 4=maximum length of a UTF-8 sequence */
7476 char buffer[4];
7477 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7478 Py_ssize_t outsize;
7479 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007480 PyObject *errorHandler = NULL;
7481 PyObject *exc = NULL;
7482 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007483 const char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007484 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007485 PyObject *rep;
7486 int ret = -1;
7487
7488 assert(insize > 0);
7489
7490 encoding = code_page_name(code_page, &encoding_obj);
7491 if (encoding == NULL)
7492 return -1;
7493
7494 if (errors == NULL || strcmp(errors, "strict") == 0) {
7495 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7496 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007497 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007498 if (exc != NULL) {
7499 PyCodec_StrictErrors(exc);
7500 Py_DECREF(exc);
7501 }
7502 Py_XDECREF(encoding_obj);
7503 return -1;
7504 }
7505
7506 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7507 pusedDefaultChar = &usedDefaultChar;
7508 else
7509 pusedDefaultChar = NULL;
7510
7511 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7512 PyErr_NoMemory();
7513 goto error;
7514 }
7515 outsize = insize * Py_ARRAY_LENGTH(buffer);
7516
7517 if (*outbytes == NULL) {
7518 /* Create string object */
7519 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7520 if (*outbytes == NULL)
7521 goto error;
7522 out = PyBytes_AS_STRING(*outbytes);
7523 }
7524 else {
7525 /* Extend string object */
7526 Py_ssize_t n = PyBytes_Size(*outbytes);
7527 if (n > PY_SSIZE_T_MAX - outsize) {
7528 PyErr_NoMemory();
7529 goto error;
7530 }
7531 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7532 goto error;
7533 out = PyBytes_AS_STRING(*outbytes) + n;
7534 }
7535
7536 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007537 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007538 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007539 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7540 wchar_t chars[2];
7541 int charsize;
7542 if (ch < 0x10000) {
7543 chars[0] = (wchar_t)ch;
7544 charsize = 1;
7545 }
7546 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007547 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7548 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007549 charsize = 2;
7550 }
7551
Victor Stinner3a50e702011-10-18 21:21:00 +02007552 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007553 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007554 buffer, Py_ARRAY_LENGTH(buffer),
7555 NULL, pusedDefaultChar);
7556 if (outsize > 0) {
7557 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7558 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007559 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007560 memcpy(out, buffer, outsize);
7561 out += outsize;
7562 continue;
7563 }
7564 }
7565 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7566 PyErr_SetFromWindowsErr(0);
7567 goto error;
7568 }
7569
Victor Stinner3a50e702011-10-18 21:21:00 +02007570 rep = unicode_encode_call_errorhandler(
7571 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007572 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007573 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007574 if (rep == NULL)
7575 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007576 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007577
7578 if (PyBytes_Check(rep)) {
7579 outsize = PyBytes_GET_SIZE(rep);
7580 if (outsize != 1) {
7581 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7582 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7583 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7584 Py_DECREF(rep);
7585 goto error;
7586 }
7587 out = PyBytes_AS_STRING(*outbytes) + offset;
7588 }
7589 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7590 out += outsize;
7591 }
7592 else {
7593 Py_ssize_t i;
7594 enum PyUnicode_Kind kind;
7595 void *data;
7596
Benjamin Petersonbac79492012-01-14 13:34:47 -05007597 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007598 Py_DECREF(rep);
7599 goto error;
7600 }
7601
7602 outsize = PyUnicode_GET_LENGTH(rep);
7603 if (outsize != 1) {
7604 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7605 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7606 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7607 Py_DECREF(rep);
7608 goto error;
7609 }
7610 out = PyBytes_AS_STRING(*outbytes) + offset;
7611 }
7612 kind = PyUnicode_KIND(rep);
7613 data = PyUnicode_DATA(rep);
7614 for (i=0; i < outsize; i++) {
7615 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7616 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007617 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007618 encoding, unicode,
7619 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007620 "unable to encode error handler result to ASCII");
7621 Py_DECREF(rep);
7622 goto error;
7623 }
7624 *out = (unsigned char)ch;
7625 out++;
7626 }
7627 }
7628 Py_DECREF(rep);
7629 }
7630 /* write a NUL byte */
7631 *out = 0;
7632 outsize = out - PyBytes_AS_STRING(*outbytes);
7633 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7634 if (_PyBytes_Resize(outbytes, outsize) < 0)
7635 goto error;
7636 ret = 0;
7637
7638error:
7639 Py_XDECREF(encoding_obj);
7640 Py_XDECREF(errorHandler);
7641 Py_XDECREF(exc);
7642 return ret;
7643}
7644
Victor Stinner3a50e702011-10-18 21:21:00 +02007645static PyObject *
7646encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007647 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007648 const char *errors)
7649{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007650 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007651 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007652 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007653 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007654
Victor Stinner29dacf22015-01-26 16:41:32 +01007655 if (!PyUnicode_Check(unicode)) {
7656 PyErr_BadArgument();
7657 return NULL;
7658 }
7659
Benjamin Petersonbac79492012-01-14 13:34:47 -05007660 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007661 return NULL;
7662 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007663
Victor Stinner3a50e702011-10-18 21:21:00 +02007664 if (code_page < 0) {
7665 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7666 return NULL;
7667 }
7668
Martin v. Löwis3d325192011-11-04 18:23:06 +01007669 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007670 return PyBytes_FromStringAndSize(NULL, 0);
7671
Victor Stinner7581cef2011-11-03 22:32:33 +01007672 offset = 0;
7673 do
7674 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007675#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007676 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007677 chunks. */
7678 if (len > INT_MAX/2) {
7679 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007680 done = 0;
7681 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007682 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007683#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007684 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007685 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007686 done = 1;
7687 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007688
Victor Stinner76a31a62011-11-04 00:05:13 +01007689 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007690 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007691 errors);
7692 if (ret == -2)
7693 ret = encode_code_page_errors(code_page, &outbytes,
7694 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007695 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007696 if (ret < 0) {
7697 Py_XDECREF(outbytes);
7698 return NULL;
7699 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007700
Victor Stinner7581cef2011-11-03 22:32:33 +01007701 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007702 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007703 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007704
Victor Stinner3a50e702011-10-18 21:21:00 +02007705 return outbytes;
7706}
7707
7708PyObject *
7709PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7710 Py_ssize_t size,
7711 const char *errors)
7712{
Victor Stinner7581cef2011-11-03 22:32:33 +01007713 PyObject *unicode, *res;
7714 unicode = PyUnicode_FromUnicode(p, size);
7715 if (unicode == NULL)
7716 return NULL;
7717 res = encode_code_page(CP_ACP, unicode, errors);
7718 Py_DECREF(unicode);
7719 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007720}
7721
7722PyObject *
7723PyUnicode_EncodeCodePage(int code_page,
7724 PyObject *unicode,
7725 const char *errors)
7726{
Victor Stinner7581cef2011-11-03 22:32:33 +01007727 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007728}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007729
Alexander Belopolsky40018472011-02-26 01:02:56 +00007730PyObject *
7731PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007732{
Victor Stinner7581cef2011-11-03 22:32:33 +01007733 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007734}
7735
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007736#undef NEED_RETRY
7737
Steve Dowercc16be82016-09-08 10:35:16 -07007738#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007739
Guido van Rossumd57fd912000-03-10 22:53:23 +00007740/* --- Character Mapping Codec -------------------------------------------- */
7741
Victor Stinnerfb161b12013-04-18 01:44:27 +02007742static int
7743charmap_decode_string(const char *s,
7744 Py_ssize_t size,
7745 PyObject *mapping,
7746 const char *errors,
7747 _PyUnicodeWriter *writer)
7748{
7749 const char *starts = s;
7750 const char *e;
7751 Py_ssize_t startinpos, endinpos;
7752 PyObject *errorHandler = NULL, *exc = NULL;
7753 Py_ssize_t maplen;
7754 enum PyUnicode_Kind mapkind;
7755 void *mapdata;
7756 Py_UCS4 x;
7757 unsigned char ch;
7758
7759 if (PyUnicode_READY(mapping) == -1)
7760 return -1;
7761
7762 maplen = PyUnicode_GET_LENGTH(mapping);
7763 mapdata = PyUnicode_DATA(mapping);
7764 mapkind = PyUnicode_KIND(mapping);
7765
7766 e = s + size;
7767
7768 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7769 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7770 * is disabled in encoding aliases, latin1 is preferred because
7771 * its implementation is faster. */
7772 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7773 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7774 Py_UCS4 maxchar = writer->maxchar;
7775
7776 assert (writer->kind == PyUnicode_1BYTE_KIND);
7777 while (s < e) {
7778 ch = *s;
7779 x = mapdata_ucs1[ch];
7780 if (x > maxchar) {
7781 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7782 goto onError;
7783 maxchar = writer->maxchar;
7784 outdata = (Py_UCS1 *)writer->data;
7785 }
7786 outdata[writer->pos] = x;
7787 writer->pos++;
7788 ++s;
7789 }
7790 return 0;
7791 }
7792
7793 while (s < e) {
7794 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7795 enum PyUnicode_Kind outkind = writer->kind;
7796 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7797 if (outkind == PyUnicode_1BYTE_KIND) {
7798 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7799 Py_UCS4 maxchar = writer->maxchar;
7800 while (s < e) {
7801 ch = *s;
7802 x = mapdata_ucs2[ch];
7803 if (x > maxchar)
7804 goto Error;
7805 outdata[writer->pos] = x;
7806 writer->pos++;
7807 ++s;
7808 }
7809 break;
7810 }
7811 else if (outkind == PyUnicode_2BYTE_KIND) {
7812 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7813 while (s < e) {
7814 ch = *s;
7815 x = mapdata_ucs2[ch];
7816 if (x == 0xFFFE)
7817 goto Error;
7818 outdata[writer->pos] = x;
7819 writer->pos++;
7820 ++s;
7821 }
7822 break;
7823 }
7824 }
7825 ch = *s;
7826
7827 if (ch < maplen)
7828 x = PyUnicode_READ(mapkind, mapdata, ch);
7829 else
7830 x = 0xfffe; /* invalid value */
7831Error:
7832 if (x == 0xfffe)
7833 {
7834 /* undefined mapping */
7835 startinpos = s-starts;
7836 endinpos = startinpos+1;
7837 if (unicode_decode_call_errorhandler_writer(
7838 errors, &errorHandler,
7839 "charmap", "character maps to <undefined>",
7840 &starts, &e, &startinpos, &endinpos, &exc, &s,
7841 writer)) {
7842 goto onError;
7843 }
7844 continue;
7845 }
7846
7847 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
7848 goto onError;
7849 ++s;
7850 }
7851 Py_XDECREF(errorHandler);
7852 Py_XDECREF(exc);
7853 return 0;
7854
7855onError:
7856 Py_XDECREF(errorHandler);
7857 Py_XDECREF(exc);
7858 return -1;
7859}
7860
7861static int
7862charmap_decode_mapping(const char *s,
7863 Py_ssize_t size,
7864 PyObject *mapping,
7865 const char *errors,
7866 _PyUnicodeWriter *writer)
7867{
7868 const char *starts = s;
7869 const char *e;
7870 Py_ssize_t startinpos, endinpos;
7871 PyObject *errorHandler = NULL, *exc = NULL;
7872 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02007873 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02007874
7875 e = s + size;
7876
7877 while (s < e) {
7878 ch = *s;
7879
7880 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7881 key = PyLong_FromLong((long)ch);
7882 if (key == NULL)
7883 goto onError;
7884
7885 item = PyObject_GetItem(mapping, key);
7886 Py_DECREF(key);
7887 if (item == NULL) {
7888 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7889 /* No mapping found means: mapping is undefined. */
7890 PyErr_Clear();
7891 goto Undefined;
7892 } else
7893 goto onError;
7894 }
7895
7896 /* Apply mapping */
7897 if (item == Py_None)
7898 goto Undefined;
7899 if (PyLong_Check(item)) {
7900 long value = PyLong_AS_LONG(item);
7901 if (value == 0xFFFE)
7902 goto Undefined;
7903 if (value < 0 || value > MAX_UNICODE) {
7904 PyErr_Format(PyExc_TypeError,
7905 "character mapping must be in range(0x%lx)",
7906 (unsigned long)MAX_UNICODE + 1);
7907 goto onError;
7908 }
7909
7910 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7911 goto onError;
7912 }
7913 else if (PyUnicode_Check(item)) {
7914 if (PyUnicode_READY(item) == -1)
7915 goto onError;
7916 if (PyUnicode_GET_LENGTH(item) == 1) {
7917 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
7918 if (value == 0xFFFE)
7919 goto Undefined;
7920 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7921 goto onError;
7922 }
7923 else {
7924 writer->overallocate = 1;
7925 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
7926 goto onError;
7927 }
7928 }
7929 else {
7930 /* wrong return value */
7931 PyErr_SetString(PyExc_TypeError,
7932 "character mapping must return integer, None or str");
7933 goto onError;
7934 }
7935 Py_CLEAR(item);
7936 ++s;
7937 continue;
7938
7939Undefined:
7940 /* undefined mapping */
7941 Py_CLEAR(item);
7942 startinpos = s-starts;
7943 endinpos = startinpos+1;
7944 if (unicode_decode_call_errorhandler_writer(
7945 errors, &errorHandler,
7946 "charmap", "character maps to <undefined>",
7947 &starts, &e, &startinpos, &endinpos, &exc, &s,
7948 writer)) {
7949 goto onError;
7950 }
7951 }
7952 Py_XDECREF(errorHandler);
7953 Py_XDECREF(exc);
7954 return 0;
7955
7956onError:
7957 Py_XDECREF(item);
7958 Py_XDECREF(errorHandler);
7959 Py_XDECREF(exc);
7960 return -1;
7961}
7962
Alexander Belopolsky40018472011-02-26 01:02:56 +00007963PyObject *
7964PyUnicode_DecodeCharmap(const char *s,
7965 Py_ssize_t size,
7966 PyObject *mapping,
7967 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007968{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007969 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00007970
Guido van Rossumd57fd912000-03-10 22:53:23 +00007971 /* Default to Latin-1 */
7972 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007973 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007974
Guido van Rossumd57fd912000-03-10 22:53:23 +00007975 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02007976 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02007977 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02007978 writer.min_length = size;
7979 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007980 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007981
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007982 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02007983 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
7984 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007985 }
7986 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02007987 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
7988 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007989 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007990 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007991
Benjamin Peterson29060642009-01-31 22:14:21 +00007992 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007993 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007994 return NULL;
7995}
7996
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007997/* Charmap encoding: the lookup table */
7998
Alexander Belopolsky40018472011-02-26 01:02:56 +00007999struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00008000 PyObject_HEAD
8001 unsigned char level1[32];
8002 int count2, count3;
8003 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008004};
8005
8006static PyObject*
8007encoding_map_size(PyObject *obj, PyObject* args)
8008{
8009 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008010 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00008011 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008012}
8013
8014static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008015 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00008016 PyDoc_STR("Return the size (in bytes) of this object") },
8017 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008018};
8019
8020static void
8021encoding_map_dealloc(PyObject* o)
8022{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008023 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008024}
8025
8026static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008027 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008028 "EncodingMap", /*tp_name*/
8029 sizeof(struct encoding_map), /*tp_basicsize*/
8030 0, /*tp_itemsize*/
8031 /* methods */
8032 encoding_map_dealloc, /*tp_dealloc*/
8033 0, /*tp_print*/
8034 0, /*tp_getattr*/
8035 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00008036 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008037 0, /*tp_repr*/
8038 0, /*tp_as_number*/
8039 0, /*tp_as_sequence*/
8040 0, /*tp_as_mapping*/
8041 0, /*tp_hash*/
8042 0, /*tp_call*/
8043 0, /*tp_str*/
8044 0, /*tp_getattro*/
8045 0, /*tp_setattro*/
8046 0, /*tp_as_buffer*/
8047 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8048 0, /*tp_doc*/
8049 0, /*tp_traverse*/
8050 0, /*tp_clear*/
8051 0, /*tp_richcompare*/
8052 0, /*tp_weaklistoffset*/
8053 0, /*tp_iter*/
8054 0, /*tp_iternext*/
8055 encoding_map_methods, /*tp_methods*/
8056 0, /*tp_members*/
8057 0, /*tp_getset*/
8058 0, /*tp_base*/
8059 0, /*tp_dict*/
8060 0, /*tp_descr_get*/
8061 0, /*tp_descr_set*/
8062 0, /*tp_dictoffset*/
8063 0, /*tp_init*/
8064 0, /*tp_alloc*/
8065 0, /*tp_new*/
8066 0, /*tp_free*/
8067 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008068};
8069
8070PyObject*
8071PyUnicode_BuildEncodingMap(PyObject* string)
8072{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008073 PyObject *result;
8074 struct encoding_map *mresult;
8075 int i;
8076 int need_dict = 0;
8077 unsigned char level1[32];
8078 unsigned char level2[512];
8079 unsigned char *mlevel1, *mlevel2, *mlevel3;
8080 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008081 int kind;
8082 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008083 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008084 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008085
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008086 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008087 PyErr_BadArgument();
8088 return NULL;
8089 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008090 kind = PyUnicode_KIND(string);
8091 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008092 length = PyUnicode_GET_LENGTH(string);
8093 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008094 memset(level1, 0xFF, sizeof level1);
8095 memset(level2, 0xFF, sizeof level2);
8096
8097 /* If there isn't a one-to-one mapping of NULL to \0,
8098 or if there are non-BMP characters, we need to use
8099 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008100 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008101 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008102 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008103 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008104 ch = PyUnicode_READ(kind, data, i);
8105 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008106 need_dict = 1;
8107 break;
8108 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008109 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008110 /* unmapped character */
8111 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008112 l1 = ch >> 11;
8113 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008114 if (level1[l1] == 0xFF)
8115 level1[l1] = count2++;
8116 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008117 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008118 }
8119
8120 if (count2 >= 0xFF || count3 >= 0xFF)
8121 need_dict = 1;
8122
8123 if (need_dict) {
8124 PyObject *result = PyDict_New();
8125 PyObject *key, *value;
8126 if (!result)
8127 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008128 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008129 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008130 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008131 if (!key || !value)
8132 goto failed1;
8133 if (PyDict_SetItem(result, key, value) == -1)
8134 goto failed1;
8135 Py_DECREF(key);
8136 Py_DECREF(value);
8137 }
8138 return result;
8139 failed1:
8140 Py_XDECREF(key);
8141 Py_XDECREF(value);
8142 Py_DECREF(result);
8143 return NULL;
8144 }
8145
8146 /* Create a three-level trie */
8147 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8148 16*count2 + 128*count3 - 1);
8149 if (!result)
8150 return PyErr_NoMemory();
8151 PyObject_Init(result, &EncodingMapType);
8152 mresult = (struct encoding_map*)result;
8153 mresult->count2 = count2;
8154 mresult->count3 = count3;
8155 mlevel1 = mresult->level1;
8156 mlevel2 = mresult->level23;
8157 mlevel3 = mresult->level23 + 16*count2;
8158 memcpy(mlevel1, level1, 32);
8159 memset(mlevel2, 0xFF, 16*count2);
8160 memset(mlevel3, 0, 128*count3);
8161 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008162 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008163 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008164 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8165 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008166 /* unmapped character */
8167 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008168 o1 = ch>>11;
8169 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008170 i2 = 16*mlevel1[o1] + o2;
8171 if (mlevel2[i2] == 0xFF)
8172 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008173 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008174 i3 = 128*mlevel2[i2] + o3;
8175 mlevel3[i3] = i;
8176 }
8177 return result;
8178}
8179
8180static int
Victor Stinner22168992011-11-20 17:09:18 +01008181encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008182{
8183 struct encoding_map *map = (struct encoding_map*)mapping;
8184 int l1 = c>>11;
8185 int l2 = (c>>7) & 0xF;
8186 int l3 = c & 0x7F;
8187 int i;
8188
Victor Stinner22168992011-11-20 17:09:18 +01008189 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008190 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008191 if (c == 0)
8192 return 0;
8193 /* level 1*/
8194 i = map->level1[l1];
8195 if (i == 0xFF) {
8196 return -1;
8197 }
8198 /* level 2*/
8199 i = map->level23[16*i+l2];
8200 if (i == 0xFF) {
8201 return -1;
8202 }
8203 /* level 3 */
8204 i = map->level23[16*map->count2 + 128*i + l3];
8205 if (i == 0) {
8206 return -1;
8207 }
8208 return i;
8209}
8210
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008211/* Lookup the character ch in the mapping. If the character
8212 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008213 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008214static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008215charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008216{
Christian Heimes217cfd12007-12-02 14:31:20 +00008217 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008218 PyObject *x;
8219
8220 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008221 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008222 x = PyObject_GetItem(mapping, w);
8223 Py_DECREF(w);
8224 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008225 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8226 /* No mapping found means: mapping is undefined. */
8227 PyErr_Clear();
8228 x = Py_None;
8229 Py_INCREF(x);
8230 return x;
8231 } else
8232 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008233 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008234 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008235 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008236 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008237 long value = PyLong_AS_LONG(x);
8238 if (value < 0 || value > 255) {
8239 PyErr_SetString(PyExc_TypeError,
8240 "character mapping must be in range(256)");
8241 Py_DECREF(x);
8242 return NULL;
8243 }
8244 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008245 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008246 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008247 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008248 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008249 /* wrong return value */
8250 PyErr_Format(PyExc_TypeError,
8251 "character mapping must return integer, bytes or None, not %.400s",
8252 x->ob_type->tp_name);
8253 Py_DECREF(x);
8254 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008255 }
8256}
8257
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008258static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008259charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008260{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008261 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8262 /* exponentially overallocate to minimize reallocations */
8263 if (requiredsize < 2*outsize)
8264 requiredsize = 2*outsize;
8265 if (_PyBytes_Resize(outobj, requiredsize))
8266 return -1;
8267 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008268}
8269
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the character has no mapping; nothing written */
    enc_EXCEPTION   /* an error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008273/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008274 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008275 space is available. Return a new reference to the object that
8276 was put in the output buffer, or Py_None, if the mapping was undefined
8277 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008278 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008279static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008280charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008281 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008282{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008283 PyObject *rep;
8284 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008285 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008286
Christian Heimes90aa7642007-12-19 02:45:37 +00008287 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008288 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008289 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008290 if (res == -1)
8291 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008292 if (outsize<requiredsize)
8293 if (charmapencode_resize(outobj, outpos, requiredsize))
8294 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008295 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008296 outstart[(*outpos)++] = (char)res;
8297 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008298 }
8299
8300 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008301 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008302 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008303 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008304 Py_DECREF(rep);
8305 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008306 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008307 if (PyLong_Check(rep)) {
8308 Py_ssize_t requiredsize = *outpos+1;
8309 if (outsize<requiredsize)
8310 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8311 Py_DECREF(rep);
8312 return enc_EXCEPTION;
8313 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008314 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008315 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008316 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008317 else {
8318 const char *repchars = PyBytes_AS_STRING(rep);
8319 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8320 Py_ssize_t requiredsize = *outpos+repsize;
8321 if (outsize<requiredsize)
8322 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8323 Py_DECREF(rep);
8324 return enc_EXCEPTION;
8325 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008326 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008327 memcpy(outstart + *outpos, repchars, repsize);
8328 *outpos += repsize;
8329 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008330 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008331 Py_DECREF(rep);
8332 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008333}
8334
8335/* handle an error in PyUnicode_EncodeCharmap
8336 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008337static int
8338charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008339 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008340 PyObject **exceptionObject,
Victor Stinner50149202015-09-22 00:26:54 +02008341 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008342 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008343{
8344 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008345 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008346 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008347 enum PyUnicode_Kind kind;
8348 void *data;
8349 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008350 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008351 Py_ssize_t collstartpos = *inpos;
8352 Py_ssize_t collendpos = *inpos+1;
8353 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008354 char *encoding = "charmap";
8355 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008356 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008357 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008358 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008359
Benjamin Petersonbac79492012-01-14 13:34:47 -05008360 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008361 return -1;
8362 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008363 /* find all unencodable characters */
8364 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008365 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008366 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008367 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008368 val = encoding_map_lookup(ch, mapping);
8369 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008370 break;
8371 ++collendpos;
8372 continue;
8373 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008374
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008375 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8376 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008377 if (rep==NULL)
8378 return -1;
8379 else if (rep!=Py_None) {
8380 Py_DECREF(rep);
8381 break;
8382 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008383 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008384 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008385 }
8386 /* cache callback name lookup
8387 * (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02008388 if (*error_handler == _Py_ERROR_UNKNOWN)
8389 *error_handler = get_error_handler(errors);
8390
8391 switch (*error_handler) {
8392 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008393 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008394 return -1;
Victor Stinner50149202015-09-22 00:26:54 +02008395
8396 case _Py_ERROR_REPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008397 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008398 x = charmapencode_output('?', mapping, res, respos);
8399 if (x==enc_EXCEPTION) {
8400 return -1;
8401 }
8402 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008403 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008404 return -1;
8405 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008406 }
8407 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02008408 case _Py_ERROR_IGNORE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008409 *inpos = collendpos;
8410 break;
Victor Stinner50149202015-09-22 00:26:54 +02008411
8412 case _Py_ERROR_XMLCHARREFREPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008413 /* generate replacement (temporarily (mis)uses p) */
8414 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008415 char buffer[2+29+1+1];
8416 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008417 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008418 for (cp = buffer; *cp; ++cp) {
8419 x = charmapencode_output(*cp, mapping, res, respos);
8420 if (x==enc_EXCEPTION)
8421 return -1;
8422 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008423 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008424 return -1;
8425 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008426 }
8427 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008428 *inpos = collendpos;
8429 break;
Victor Stinner50149202015-09-22 00:26:54 +02008430
Benjamin Peterson14339b62009-01-31 16:36:08 +00008431 default:
Victor Stinner50149202015-09-22 00:26:54 +02008432 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008433 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008434 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008435 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008436 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008437 if (PyBytes_Check(repunicode)) {
8438 /* Directly copy bytes result to output. */
8439 Py_ssize_t outsize = PyBytes_Size(*res);
8440 Py_ssize_t requiredsize;
8441 repsize = PyBytes_Size(repunicode);
8442 requiredsize = *respos + repsize;
8443 if (requiredsize > outsize)
8444 /* Make room for all additional bytes. */
8445 if (charmapencode_resize(res, respos, requiredsize)) {
8446 Py_DECREF(repunicode);
8447 return -1;
8448 }
8449 memcpy(PyBytes_AsString(*res) + *respos,
8450 PyBytes_AsString(repunicode), repsize);
8451 *respos += repsize;
8452 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008453 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008454 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008455 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008456 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008457 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008458 Py_DECREF(repunicode);
8459 return -1;
8460 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008461 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008462 data = PyUnicode_DATA(repunicode);
8463 kind = PyUnicode_KIND(repunicode);
8464 for (index = 0; index < repsize; index++) {
8465 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8466 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008467 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008468 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008469 return -1;
8470 }
8471 else if (x==enc_FAILED) {
8472 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008473 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008474 return -1;
8475 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008476 }
8477 *inpos = newpos;
8478 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008479 }
8480 return 0;
8481}
8482
Alexander Belopolsky40018472011-02-26 01:02:56 +00008483PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008484_PyUnicode_EncodeCharmap(PyObject *unicode,
8485 PyObject *mapping,
8486 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008487{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008488 /* output object */
8489 PyObject *res = NULL;
8490 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008491 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008492 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008493 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008494 Py_ssize_t respos = 0;
Victor Stinner50149202015-09-22 00:26:54 +02008495 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008496 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02008497 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008498 void *data;
8499 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008500
Benjamin Petersonbac79492012-01-14 13:34:47 -05008501 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008502 return NULL;
8503 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008504 data = PyUnicode_DATA(unicode);
8505 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008506
Guido van Rossumd57fd912000-03-10 22:53:23 +00008507 /* Default to Latin-1 */
8508 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008509 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008510
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008511 /* allocate enough for a simple encoding without
8512 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008513 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008514 if (res == NULL)
8515 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008516 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008517 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008518
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008519 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008520 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008521 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008522 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008523 if (x==enc_EXCEPTION) /* error */
8524 goto onError;
8525 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008526 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008527 &exc,
Victor Stinner50149202015-09-22 00:26:54 +02008528 &error_handler, &error_handler_obj, errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00008529 &res, &respos)) {
8530 goto onError;
8531 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008532 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008533 else
8534 /* done with this character => adjust input position */
8535 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008536 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008537
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008538 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008539 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008540 if (_PyBytes_Resize(&res, respos) < 0)
8541 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008542
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008543 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008544 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008545 return res;
8546
Benjamin Peterson29060642009-01-31 22:14:21 +00008547 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008548 Py_XDECREF(res);
8549 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008550 Py_XDECREF(error_handler_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008551 return NULL;
8552}
8553
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008554/* Deprecated */
8555PyObject *
8556PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8557 Py_ssize_t size,
8558 PyObject *mapping,
8559 const char *errors)
8560{
8561 PyObject *result;
8562 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8563 if (unicode == NULL)
8564 return NULL;
8565 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8566 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008567 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008568}
8569
Alexander Belopolsky40018472011-02-26 01:02:56 +00008570PyObject *
8571PyUnicode_AsCharmapString(PyObject *unicode,
8572 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008573{
8574 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008575 PyErr_BadArgument();
8576 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008577 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008578 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008579}
8580
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008581/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008582static void
8583make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008584 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008585 Py_ssize_t startpos, Py_ssize_t endpos,
8586 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008587{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008588 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008589 *exceptionObject = _PyUnicodeTranslateError_Create(
8590 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008591 }
8592 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008593 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8594 goto onError;
8595 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8596 goto onError;
8597 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8598 goto onError;
8599 return;
8600 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008601 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008602 }
8603}
8604
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008605/* error handling callback helper:
8606 build arguments, call the callback and check the arguments,
8607 put the result into newpos and return the replacement string, which
8608 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008609static PyObject *
8610unicode_translate_call_errorhandler(const char *errors,
8611 PyObject **errorHandler,
8612 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008613 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008614 Py_ssize_t startpos, Py_ssize_t endpos,
8615 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008616{
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008617 static const char *argparse = "Un;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008618
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008619 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008620 PyObject *restuple;
8621 PyObject *resunicode;
8622
8623 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008624 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008625 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008626 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008627 }
8628
8629 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008630 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008631 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008632 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008633
8634 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008635 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008636 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008637 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008638 if (!PyTuple_Check(restuple)) {
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008639 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008640 Py_DECREF(restuple);
8641 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008642 }
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008643 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00008644 &resunicode, &i_newpos)) {
8645 Py_DECREF(restuple);
8646 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008647 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008648 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008649 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008650 else
8651 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008652 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02008653 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008654 Py_DECREF(restuple);
8655 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008656 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008657 Py_INCREF(resunicode);
8658 Py_DECREF(restuple);
8659 return resunicode;
8660}
8661
8662/* Lookup the character ch in the mapping and put the result in result,
8663 which must be decrefed by the caller.
8664 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008665static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008666charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008667{
Christian Heimes217cfd12007-12-02 14:31:20 +00008668 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008669 PyObject *x;
8670
8671 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008672 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008673 x = PyObject_GetItem(mapping, w);
8674 Py_DECREF(w);
8675 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008676 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8677 /* No mapping found means: use 1:1 mapping. */
8678 PyErr_Clear();
8679 *result = NULL;
8680 return 0;
8681 } else
8682 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008683 }
8684 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008685 *result = x;
8686 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008687 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008688 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008689 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008690 if (value < 0 || value > MAX_UNICODE) {
8691 PyErr_Format(PyExc_ValueError,
8692 "character mapping must be in range(0x%x)",
8693 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008694 Py_DECREF(x);
8695 return -1;
8696 }
8697 *result = x;
8698 return 0;
8699 }
8700 else if (PyUnicode_Check(x)) {
8701 *result = x;
8702 return 0;
8703 }
8704 else {
8705 /* wrong return value */
8706 PyErr_SetString(PyExc_TypeError,
8707 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008708 Py_DECREF(x);
8709 return -1;
8710 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008711}
Victor Stinner1194ea02014-04-04 19:37:40 +02008712
8713/* lookup the character, write the result into the writer.
8714 Return 1 if the result was written into the writer, return 0 if the mapping
8715 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008716static int
Victor Stinner1194ea02014-04-04 19:37:40 +02008717charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8718 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008719{
Victor Stinner1194ea02014-04-04 19:37:40 +02008720 PyObject *item;
8721
8722 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00008723 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008724
8725 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008726 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02008727 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008728 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008729 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008730 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008731 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008732
8733 if (item == Py_None) {
8734 Py_DECREF(item);
8735 return 0;
8736 }
8737
8738 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02008739 long ch = (Py_UCS4)PyLong_AS_LONG(item);
8740 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8741 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008742 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8743 Py_DECREF(item);
8744 return -1;
8745 }
8746 Py_DECREF(item);
8747 return 1;
8748 }
8749
8750 if (!PyUnicode_Check(item)) {
8751 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008752 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008753 }
8754
8755 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8756 Py_DECREF(item);
8757 return -1;
8758 }
8759
8760 Py_DECREF(item);
8761 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008762}
8763
Victor Stinner89a76ab2014-04-05 11:44:04 +02008764static int
8765unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8766 Py_UCS1 *translate)
8767{
Benjamin Peterson1365de72014-04-07 20:15:41 -04008768 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008769 int ret = 0;
8770
Victor Stinner89a76ab2014-04-05 11:44:04 +02008771 if (charmaptranslate_lookup(ch, mapping, &item)) {
8772 return -1;
8773 }
8774
8775 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008776 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02008777 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008778 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008779 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008780 /* not found => default to 1:1 mapping */
8781 translate[ch] = ch;
8782 return 1;
8783 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008784 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02008785 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008786 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8787 used it */
8788 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008789 /* invalid character or character outside ASCII:
8790 skip the fast translate */
8791 goto exit;
8792 }
8793 translate[ch] = (Py_UCS1)replace;
8794 }
8795 else if (PyUnicode_Check(item)) {
8796 Py_UCS4 replace;
8797
8798 if (PyUnicode_READY(item) == -1) {
8799 Py_DECREF(item);
8800 return -1;
8801 }
8802 if (PyUnicode_GET_LENGTH(item) != 1)
8803 goto exit;
8804
8805 replace = PyUnicode_READ_CHAR(item, 0);
8806 if (replace > 127)
8807 goto exit;
8808 translate[ch] = (Py_UCS1)replace;
8809 }
8810 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008811 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02008812 goto exit;
8813 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008814 ret = 1;
8815
Benjamin Peterson1365de72014-04-07 20:15:41 -04008816 exit:
8817 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008818 return ret;
8819}
8820
8821/* Fast path for ascii => ascii translation. Return 1 if the whole string
8822 was translated into writer, return 0 if the input string was partially
8823 translated into writer, raise an exception and return -1 on error. */
8824static int
8825unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008826 _PyUnicodeWriter *writer, int ignore,
8827 Py_ssize_t *input_pos)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008828{
Victor Stinner872b2912014-04-05 14:27:07 +02008829 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008830 Py_ssize_t len;
8831 Py_UCS1 *in, *end, *out;
Victor Stinner872b2912014-04-05 14:27:07 +02008832 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008833
Victor Stinner89a76ab2014-04-05 11:44:04 +02008834 len = PyUnicode_GET_LENGTH(input);
8835
Victor Stinner872b2912014-04-05 14:27:07 +02008836 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008837
8838 in = PyUnicode_1BYTE_DATA(input);
8839 end = in + len;
8840
8841 assert(PyUnicode_IS_ASCII(writer->buffer));
8842 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
8843 out = PyUnicode_1BYTE_DATA(writer->buffer);
8844
Victor Stinner872b2912014-04-05 14:27:07 +02008845 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008846 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02008847 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008848 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02008849 int translate = unicode_fast_translate_lookup(mapping, ch,
8850 ascii_table);
8851 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008852 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02008853 if (translate == 0)
8854 goto exit;
8855 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008856 }
Victor Stinner872b2912014-04-05 14:27:07 +02008857 if (ch2 == 0xfe) {
8858 if (ignore)
8859 continue;
8860 goto exit;
8861 }
8862 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008863 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02008864 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008865 }
Victor Stinner872b2912014-04-05 14:27:07 +02008866 res = 1;
8867
8868exit:
8869 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008870 *input_pos = in - PyUnicode_1BYTE_DATA(input);
Victor Stinner872b2912014-04-05 14:27:07 +02008871 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008872}
8873
Victor Stinner3222da22015-10-01 22:07:32 +02008874static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008875_PyUnicode_TranslateCharmap(PyObject *input,
8876 PyObject *mapping,
8877 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008878{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008879 /* input object */
Victor Stinner1194ea02014-04-04 19:37:40 +02008880 char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008881 Py_ssize_t size, i;
8882 int kind;
8883 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02008884 _PyUnicodeWriter writer;
8885 /* error handler */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008886 char *reason = "character maps to <undefined>";
8887 PyObject *errorHandler = NULL;
8888 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008889 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008890 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008891
Guido van Rossumd57fd912000-03-10 22:53:23 +00008892 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008893 PyErr_BadArgument();
8894 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008895 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008896
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008897 if (PyUnicode_READY(input) == -1)
8898 return NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008899 data = (char*)PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008900 kind = PyUnicode_KIND(input);
8901 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008902
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03008903 if (size == 0)
8904 return PyUnicode_FromObject(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008905
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008906 /* allocate enough for a simple 1:1 translation without
8907 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02008908 _PyUnicodeWriter_Init(&writer);
8909 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008910 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008911
Victor Stinner872b2912014-04-05 14:27:07 +02008912 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
8913
Victor Stinner33798672016-03-01 21:59:58 +01008914 if (PyUnicode_READY(input) == -1)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008915 return NULL;
Victor Stinner33798672016-03-01 21:59:58 +01008916 if (PyUnicode_IS_ASCII(input)) {
8917 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
8918 if (res < 0) {
8919 _PyUnicodeWriter_Dealloc(&writer);
8920 return NULL;
8921 }
8922 if (res == 1)
8923 return _PyUnicodeWriter_Finish(&writer);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008924 }
Victor Stinner33798672016-03-01 21:59:58 +01008925 else {
8926 i = 0;
8927 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008928
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008929 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008930 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008931 int translate;
8932 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8933 Py_ssize_t newpos;
8934 /* startpos for collecting untranslatable chars */
8935 Py_ssize_t collstart;
8936 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02008937 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008938
Victor Stinner1194ea02014-04-04 19:37:40 +02008939 ch = PyUnicode_READ(kind, data, i);
8940 translate = charmaptranslate_output(ch, mapping, &writer);
8941 if (translate < 0)
8942 goto onError;
8943
8944 if (translate != 0) {
8945 /* it worked => adjust input pointer */
8946 ++i;
8947 continue;
8948 }
8949
8950 /* untranslatable character */
8951 collstart = i;
8952 collend = i+1;
8953
8954 /* find all untranslatable characters */
8955 while (collend < size) {
8956 PyObject *x;
8957 ch = PyUnicode_READ(kind, data, collend);
8958 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008959 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02008960 Py_XDECREF(x);
8961 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008962 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02008963 ++collend;
8964 }
8965
8966 if (ignore) {
8967 i = collend;
8968 }
8969 else {
8970 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
8971 reason, input, &exc,
8972 collstart, collend, &newpos);
8973 if (repunicode == NULL)
8974 goto onError;
8975 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008976 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02008977 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008978 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008979 Py_DECREF(repunicode);
8980 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008981 }
8982 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008983 Py_XDECREF(exc);
8984 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02008985 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008986
Benjamin Peterson29060642009-01-31 22:14:21 +00008987 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02008988 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008989 Py_XDECREF(exc);
8990 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008991 return NULL;
8992}
8993
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008994/* Deprecated. Use PyUnicode_Translate instead. */
8995PyObject *
8996PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8997 Py_ssize_t size,
8998 PyObject *mapping,
8999 const char *errors)
9000{
Christian Heimes5f520f42012-09-11 14:03:25 +02009001 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009002 PyObject *unicode = PyUnicode_FromUnicode(p, size);
9003 if (!unicode)
9004 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02009005 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9006 Py_DECREF(unicode);
9007 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009008}
9009
Alexander Belopolsky40018472011-02-26 01:02:56 +00009010PyObject *
9011PyUnicode_Translate(PyObject *str,
9012 PyObject *mapping,
9013 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009014{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009015 if (ensure_unicode(str) < 0)
Christian Heimes5f520f42012-09-11 14:03:25 +02009016 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009017 return _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009018}
Tim Petersced69f82003-09-16 20:30:58 +00009019
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009020static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009021fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009022{
9023 /* No need to call PyUnicode_READY(self) because this function is only
9024 called as a callback from fixup() which does it already. */
9025 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9026 const int kind = PyUnicode_KIND(self);
9027 void *data = PyUnicode_DATA(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02009028 Py_UCS4 maxchar = 127, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009029 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009030 Py_ssize_t i;
9031
9032 for (i = 0; i < len; ++i) {
9033 ch = PyUnicode_READ(kind, data, i);
9034 fixed = 0;
9035 if (ch > 127) {
9036 if (Py_UNICODE_ISSPACE(ch))
9037 fixed = ' ';
9038 else {
9039 const int decimal = Py_UNICODE_TODECIMAL(ch);
9040 if (decimal >= 0)
9041 fixed = '0' + decimal;
9042 }
9043 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009044 modified = 1;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009045 maxchar = Py_MAX(maxchar, fixed);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009046 PyUnicode_WRITE(kind, data, i, fixed);
9047 }
Victor Stinnere6abb482012-05-02 01:15:40 +02009048 else
Benjamin Peterson7e303732013-06-10 09:19:46 -07009049 maxchar = Py_MAX(maxchar, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009050 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009051 }
9052
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009053 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009054}
9055
9056PyObject *
9057_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9058{
9059 if (!PyUnicode_Check(unicode)) {
9060 PyErr_BadInternalCall();
9061 return NULL;
9062 }
9063 if (PyUnicode_READY(unicode) == -1)
9064 return NULL;
9065 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
9066 /* If the string is already ASCII, just return the same string */
9067 Py_INCREF(unicode);
9068 return unicode;
9069 }
Victor Stinner9310abb2011-10-05 00:59:23 +02009070 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009071}
9072
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009073PyObject *
9074PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9075 Py_ssize_t length)
9076{
Victor Stinnerf0124502011-11-21 23:12:56 +01009077 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009078 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009079 Py_UCS4 maxchar;
9080 enum PyUnicode_Kind kind;
9081 void *data;
9082
Victor Stinner99d7ad02012-02-22 13:37:39 +01009083 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009084 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009085 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009086 if (ch > 127) {
9087 int decimal = Py_UNICODE_TODECIMAL(ch);
9088 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009089 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009090 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009091 }
9092 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009093
9094 /* Copy to a new string */
9095 decimal = PyUnicode_New(length, maxchar);
9096 if (decimal == NULL)
9097 return decimal;
9098 kind = PyUnicode_KIND(decimal);
9099 data = PyUnicode_DATA(decimal);
9100 /* Iterate over code points */
9101 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009102 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01009103 if (ch > 127) {
9104 int decimal = Py_UNICODE_TODECIMAL(ch);
9105 if (decimal >= 0)
9106 ch = '0' + decimal;
9107 }
9108 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009109 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009110 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009111}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009112/* --- Decimal Encoder ---------------------------------------------------- */
9113
Alexander Belopolsky40018472011-02-26 01:02:56 +00009114int
9115PyUnicode_EncodeDecimal(Py_UNICODE *s,
9116 Py_ssize_t length,
9117 char *output,
9118 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009119{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009120 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009121 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009122 enum PyUnicode_Kind kind;
9123 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009124
9125 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009126 PyErr_BadArgument();
9127 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009128 }
9129
Victor Stinner42bf7752011-11-21 22:52:58 +01009130 unicode = PyUnicode_FromUnicode(s, length);
9131 if (unicode == NULL)
9132 return -1;
9133
Benjamin Petersonbac79492012-01-14 13:34:47 -05009134 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner6345be92011-11-25 20:09:01 +01009135 Py_DECREF(unicode);
9136 return -1;
9137 }
Victor Stinner42bf7752011-11-21 22:52:58 +01009138 kind = PyUnicode_KIND(unicode);
9139 data = PyUnicode_DATA(unicode);
9140
Victor Stinnerb84d7232011-11-22 01:50:07 +01009141 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009142 PyObject *exc;
9143 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009144 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009145 Py_ssize_t startpos;
9146
9147 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009148
Benjamin Peterson29060642009-01-31 22:14:21 +00009149 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009150 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009151 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009152 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009153 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009154 decimal = Py_UNICODE_TODECIMAL(ch);
9155 if (decimal >= 0) {
9156 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009157 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009158 continue;
9159 }
9160 if (0 < ch && ch < 256) {
9161 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009162 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009163 continue;
9164 }
Victor Stinner6345be92011-11-25 20:09:01 +01009165
Victor Stinner42bf7752011-11-21 22:52:58 +01009166 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009167 exc = NULL;
9168 raise_encode_exception(&exc, "decimal", unicode,
9169 startpos, startpos+1,
9170 "invalid decimal Unicode string");
9171 Py_XDECREF(exc);
9172 Py_DECREF(unicode);
9173 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009174 }
9175 /* 0-terminate the output string */
9176 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009177 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009178 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009179}
9180
Guido van Rossumd57fd912000-03-10 22:53:23 +00009181/* --- Helpers ------------------------------------------------------------ */
9182
/* Helper macro to fix up start/end slice values in place.

   `end` is capped at `len`; a negative `end` or `start` is taken as an
   offset from `len` and floored at 0.  A `start` larger than `len` is
   left untouched, so callers must still compare start against end.

   `start` and `end` must be modifiable lvalues; all three arguments are
   evaluated more than once.  The body is wrapped in do { } while (0) so
   that the macro acts as one statement (safe under an unbraced if/else). */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
9197
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009198static Py_ssize_t
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009199any_find_slice(PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009200 Py_ssize_t start,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009201 Py_ssize_t end,
9202 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009203{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009204 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009205 void *buf1, *buf2;
9206 Py_ssize_t len1, len2, result;
9207
9208 kind1 = PyUnicode_KIND(s1);
9209 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009210 if (kind1 < kind2)
9211 return -1;
9212
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009213 len1 = PyUnicode_GET_LENGTH(s1);
9214 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009215 ADJUST_INDICES(start, end, len1);
9216 if (end - start < len2)
9217 return -1;
9218
9219 buf1 = PyUnicode_DATA(s1);
9220 buf2 = PyUnicode_DATA(s2);
9221 if (len2 == 1) {
9222 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9223 result = findchar((const char *)buf1 + kind1*start,
9224 kind1, end - start, ch, direction);
9225 if (result == -1)
9226 return -1;
9227 else
9228 return start + result;
9229 }
9230
9231 if (kind2 != kind1) {
9232 buf2 = _PyUnicode_AsKind(s2, kind1);
9233 if (!buf2)
9234 return -2;
9235 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009236
Victor Stinner794d5672011-10-10 03:21:36 +02009237 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009238 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009239 case PyUnicode_1BYTE_KIND:
9240 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9241 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9242 else
9243 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9244 break;
9245 case PyUnicode_2BYTE_KIND:
9246 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9247 break;
9248 case PyUnicode_4BYTE_KIND:
9249 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9250 break;
9251 default:
9252 assert(0); result = -2;
9253 }
9254 }
9255 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009256 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009257 case PyUnicode_1BYTE_KIND:
9258 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9259 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9260 else
9261 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9262 break;
9263 case PyUnicode_2BYTE_KIND:
9264 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9265 break;
9266 case PyUnicode_4BYTE_KIND:
9267 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9268 break;
9269 default:
9270 assert(0); result = -2;
9271 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009272 }
9273
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009274 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009275 PyMem_Free(buf2);
9276
9277 return result;
9278}
9279
9280Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009281_PyUnicode_InsertThousandsGrouping(
9282 PyObject *unicode, Py_ssize_t index,
9283 Py_ssize_t n_buffer,
9284 void *digits, Py_ssize_t n_digits,
9285 Py_ssize_t min_width,
9286 const char *grouping, PyObject *thousands_sep,
9287 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009288{
Victor Stinner41a863c2012-02-24 00:37:51 +01009289 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009290 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01009291 Py_ssize_t thousands_sep_len;
9292 Py_ssize_t len;
9293
9294 if (unicode != NULL) {
9295 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009296 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01009297 }
9298 else {
9299 kind = PyUnicode_1BYTE_KIND;
9300 data = NULL;
9301 }
9302 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
9303 thousands_sep_data = PyUnicode_DATA(thousands_sep);
9304 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9305 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01009306 if (thousands_sep_kind < kind) {
9307 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
9308 if (!thousands_sep_data)
9309 return -1;
9310 }
9311 else {
9312 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
9313 if (!data)
9314 return -1;
9315 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009316 }
9317
Benjamin Petersonead6b532011-12-20 17:23:42 -06009318 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009319 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009320 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01009321 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009322 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009323 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009324 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009325 else
Victor Stinner41a863c2012-02-24 00:37:51 +01009326 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02009327 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009328 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009329 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009330 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009331 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009332 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009333 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009334 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009335 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009336 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009337 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009338 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009339 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009340 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009341 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009342 break;
9343 default:
9344 assert(0);
9345 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009346 }
Victor Stinner90f50d42012-02-24 01:44:47 +01009347 if (unicode != NULL && thousands_sep_kind != kind) {
9348 if (thousands_sep_kind < kind)
9349 PyMem_Free(thousands_sep_data);
9350 else
9351 PyMem_Free(data);
9352 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009353 if (unicode == NULL) {
9354 *maxchar = 127;
9355 if (len != n_digits) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009356 *maxchar = Py_MAX(*maxchar,
Victor Stinnere6abb482012-05-02 01:15:40 +02009357 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
Victor Stinner41a863c2012-02-24 00:37:51 +01009358 }
9359 }
9360 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009361}
9362
9363
Alexander Belopolsky40018472011-02-26 01:02:56 +00009364Py_ssize_t
9365PyUnicode_Count(PyObject *str,
9366 PyObject *substr,
9367 Py_ssize_t start,
9368 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009369{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009370 Py_ssize_t result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009371 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009372 void *buf1 = NULL, *buf2 = NULL;
9373 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009374
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009375 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009376 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009377
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009378 kind1 = PyUnicode_KIND(str);
9379 kind2 = PyUnicode_KIND(substr);
9380 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009381 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009382
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009383 len1 = PyUnicode_GET_LENGTH(str);
9384 len2 = PyUnicode_GET_LENGTH(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009385 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009386 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009387 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009388
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009389 buf1 = PyUnicode_DATA(str);
9390 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009391 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009392 buf2 = _PyUnicode_AsKind(substr, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009393 if (!buf2)
9394 goto onError;
9395 }
9396
9397 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009398 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009399 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
Victor Stinnerc3cec782011-10-05 21:24:08 +02009400 result = asciilib_count(
9401 ((Py_UCS1*)buf1) + start, end - start,
9402 buf2, len2, PY_SSIZE_T_MAX
9403 );
9404 else
9405 result = ucs1lib_count(
9406 ((Py_UCS1*)buf1) + start, end - start,
9407 buf2, len2, PY_SSIZE_T_MAX
9408 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009409 break;
9410 case PyUnicode_2BYTE_KIND:
9411 result = ucs2lib_count(
9412 ((Py_UCS2*)buf1) + start, end - start,
9413 buf2, len2, PY_SSIZE_T_MAX
9414 );
9415 break;
9416 case PyUnicode_4BYTE_KIND:
9417 result = ucs4lib_count(
9418 ((Py_UCS4*)buf1) + start, end - start,
9419 buf2, len2, PY_SSIZE_T_MAX
9420 );
9421 break;
9422 default:
9423 assert(0); result = 0;
9424 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009425
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009426 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009427 PyMem_Free(buf2);
9428
Guido van Rossumd57fd912000-03-10 22:53:23 +00009429 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009430 onError:
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009431 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009432 PyMem_Free(buf2);
9433 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009434}
9435
Alexander Belopolsky40018472011-02-26 01:02:56 +00009436Py_ssize_t
9437PyUnicode_Find(PyObject *str,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009438 PyObject *substr,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009439 Py_ssize_t start,
9440 Py_ssize_t end,
9441 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009442{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009443 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009444 return -2;
Tim Petersced69f82003-09-16 20:30:58 +00009445
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009446 return any_find_slice(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009447}
9448
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009449Py_ssize_t
9450PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9451 Py_ssize_t start, Py_ssize_t end,
9452 int direction)
9453{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009454 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009455 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009456 if (PyUnicode_READY(str) == -1)
9457 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009458 if (start < 0 || end < 0) {
9459 PyErr_SetString(PyExc_IndexError, "string index out of range");
9460 return -2;
9461 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009462 if (end > PyUnicode_GET_LENGTH(str))
9463 end = PyUnicode_GET_LENGTH(str);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009464 if (start >= end)
9465 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009466 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009467 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9468 kind, end-start, ch, direction);
9469 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009470 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009471 else
9472 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009473}
9474
Alexander Belopolsky40018472011-02-26 01:02:56 +00009475static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009476tailmatch(PyObject *self,
9477 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009478 Py_ssize_t start,
9479 Py_ssize_t end,
9480 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009481{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009482 int kind_self;
9483 int kind_sub;
9484 void *data_self;
9485 void *data_sub;
9486 Py_ssize_t offset;
9487 Py_ssize_t i;
9488 Py_ssize_t end_sub;
9489
9490 if (PyUnicode_READY(self) == -1 ||
9491 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009492 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009493
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009494 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9495 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009496 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009497 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009498
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009499 if (PyUnicode_GET_LENGTH(substring) == 0)
9500 return 1;
9501
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009502 kind_self = PyUnicode_KIND(self);
9503 data_self = PyUnicode_DATA(self);
9504 kind_sub = PyUnicode_KIND(substring);
9505 data_sub = PyUnicode_DATA(substring);
9506 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9507
9508 if (direction > 0)
9509 offset = end;
9510 else
9511 offset = start;
9512
9513 if (PyUnicode_READ(kind_self, data_self, offset) ==
9514 PyUnicode_READ(kind_sub, data_sub, 0) &&
9515 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9516 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9517 /* If both are of the same kind, memcmp is sufficient */
9518 if (kind_self == kind_sub) {
9519 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009520 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009521 data_sub,
9522 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009523 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009524 }
Martin Pantere26da7c2016-06-02 10:07:09 +00009525 /* otherwise we have to compare each character by first accessing it */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009526 else {
9527 /* We do not need to compare 0 and len(substring)-1 because
9528 the if statement above ensured already that they are equal
9529 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009530 for (i = 1; i < end_sub; ++i) {
9531 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9532 PyUnicode_READ(kind_sub, data_sub, i))
9533 return 0;
9534 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009535 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009536 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009537 }
9538
9539 return 0;
9540}
9541
Alexander Belopolsky40018472011-02-26 01:02:56 +00009542Py_ssize_t
9543PyUnicode_Tailmatch(PyObject *str,
9544 PyObject *substr,
9545 Py_ssize_t start,
9546 Py_ssize_t end,
9547 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009548{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009549 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009550 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009551
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009552 return tailmatch(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009553}
9554
Guido van Rossumd57fd912000-03-10 22:53:23 +00009555/* Apply fixfct filter to the Unicode object self and return a
9556 reference to the modified object */
9557
Alexander Belopolsky40018472011-02-26 01:02:56 +00009558static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009559fixup(PyObject *self,
9560 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009561{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009562 PyObject *u;
9563 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009564 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009565
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009566 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009567 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009568 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009569 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009570
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009571 /* fix functions return the new maximum character in a string,
9572 if the kind of the resulting unicode object does not change,
9573 everything is fine. Otherwise we need to change the string kind
9574 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009575 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009576
9577 if (maxchar_new == 0) {
9578 /* no changes */;
9579 if (PyUnicode_CheckExact(self)) {
9580 Py_DECREF(u);
9581 Py_INCREF(self);
9582 return self;
9583 }
9584 else
9585 return u;
9586 }
9587
Victor Stinnere6abb482012-05-02 01:15:40 +02009588 maxchar_new = align_maxchar(maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009589
Victor Stinnereaab6042011-12-11 22:22:39 +01009590 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009591 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009592
9593 /* In case the maximum character changed, we need to
9594 convert the string to the new category. */
9595 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9596 if (v == NULL) {
9597 Py_DECREF(u);
9598 return NULL;
9599 }
9600 if (maxchar_new > maxchar_old) {
9601 /* If the maxchar increased so that the kind changed, not all
9602 characters are representable anymore and we need to fix the
9603 string again. This only happens in very few cases. */
Victor Stinnerd3f08822012-05-29 12:57:52 +02009604 _PyUnicode_FastCopyCharacters(v, 0,
9605 self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinnereaab6042011-12-11 22:22:39 +01009606 maxchar_old = fixfct(v);
9607 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009608 }
9609 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009610 _PyUnicode_FastCopyCharacters(v, 0,
9611 u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009612 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009613 Py_DECREF(u);
9614 assert(_PyUnicode_CheckConsistency(v, 1));
9615 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009616}
9617
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009618static PyObject *
9619ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009620{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009621 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9622 char *resdata, *data = PyUnicode_DATA(self);
9623 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009624
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009625 res = PyUnicode_New(len, 127);
9626 if (res == NULL)
9627 return NULL;
9628 resdata = PyUnicode_DATA(res);
9629 if (lower)
9630 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009631 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009632 _Py_bytes_upper(resdata, data, len);
9633 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009634}
9635
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009636static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009637handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009638{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009639 Py_ssize_t j;
9640 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01009641 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009642 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009643
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009644 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9645
9646 where ! is a negation and \p{xxx} is a character with property xxx.
9647 */
9648 for (j = i - 1; j >= 0; j--) {
9649 c = PyUnicode_READ(kind, data, j);
9650 if (!_PyUnicode_IsCaseIgnorable(c))
9651 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009652 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009653 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9654 if (final_sigma) {
9655 for (j = i + 1; j < length; j++) {
9656 c = PyUnicode_READ(kind, data, j);
9657 if (!_PyUnicode_IsCaseIgnorable(c))
9658 break;
9659 }
9660 final_sigma = j == length || !_PyUnicode_IsCased(c);
9661 }
9662 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009663}
9664
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009665static int
9666lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9667 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009668{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009669 /* Obscure special case. */
9670 if (c == 0x3A3) {
9671 mapped[0] = handle_capital_sigma(kind, data, length, i);
9672 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009673 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009674 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009675}
9676
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009677static Py_ssize_t
9678do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009679{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009680 Py_ssize_t i, k = 0;
9681 int n_res, j;
9682 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009683
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009684 c = PyUnicode_READ(kind, data, 0);
9685 n_res = _PyUnicode_ToUpperFull(c, mapped);
9686 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009687 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009688 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009689 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009690 for (i = 1; i < length; i++) {
9691 c = PyUnicode_READ(kind, data, i);
9692 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9693 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009694 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009695 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009696 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009697 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009698 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009699}
9700
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009701static Py_ssize_t
9702do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9703 Py_ssize_t i, k = 0;
9704
9705 for (i = 0; i < length; i++) {
9706 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9707 int n_res, j;
9708 if (Py_UNICODE_ISUPPER(c)) {
9709 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9710 }
9711 else if (Py_UNICODE_ISLOWER(c)) {
9712 n_res = _PyUnicode_ToUpperFull(c, mapped);
9713 }
9714 else {
9715 n_res = 1;
9716 mapped[0] = c;
9717 }
9718 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009719 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009720 res[k++] = mapped[j];
9721 }
9722 }
9723 return k;
9724}
9725
9726static Py_ssize_t
9727do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9728 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009729{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009730 Py_ssize_t i, k = 0;
9731
9732 for (i = 0; i < length; i++) {
9733 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9734 int n_res, j;
9735 if (lower)
9736 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9737 else
9738 n_res = _PyUnicode_ToUpperFull(c, mapped);
9739 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009740 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009741 res[k++] = mapped[j];
9742 }
9743 }
9744 return k;
9745}
9746
9747static Py_ssize_t
9748do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9749{
9750 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9751}
9752
9753static Py_ssize_t
9754do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9755{
9756 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9757}
9758
Benjamin Petersone51757f2012-01-12 21:10:29 -05009759static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009760do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9761{
9762 Py_ssize_t i, k = 0;
9763
9764 for (i = 0; i < length; i++) {
9765 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9766 Py_UCS4 mapped[3];
9767 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9768 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009769 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009770 res[k++] = mapped[j];
9771 }
9772 }
9773 return k;
9774}
9775
9776static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009777do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9778{
9779 Py_ssize_t i, k = 0;
9780 int previous_is_cased;
9781
9782 previous_is_cased = 0;
9783 for (i = 0; i < length; i++) {
9784 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9785 Py_UCS4 mapped[3];
9786 int n_res, j;
9787
9788 if (previous_is_cased)
9789 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9790 else
9791 n_res = _PyUnicode_ToTitleFull(c, mapped);
9792
9793 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009794 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009795 res[k++] = mapped[j];
9796 }
9797
9798 previous_is_cased = _PyUnicode_IsCased(c);
9799 }
9800 return k;
9801}
9802
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009803static PyObject *
9804case_operation(PyObject *self,
9805 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9806{
9807 PyObject *res = NULL;
9808 Py_ssize_t length, newlength = 0;
9809 int kind, outkind;
9810 void *data, *outdata;
9811 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9812
Benjamin Petersoneea48462012-01-16 14:28:50 -05009813 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009814
9815 kind = PyUnicode_KIND(self);
9816 data = PyUnicode_DATA(self);
9817 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +02009818 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -04009819 PyErr_SetString(PyExc_OverflowError, "string is too long");
9820 return NULL;
9821 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -04009822 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009823 if (tmp == NULL)
9824 return PyErr_NoMemory();
9825 newlength = perform(kind, data, length, tmp, &maxchar);
9826 res = PyUnicode_New(newlength, maxchar);
9827 if (res == NULL)
9828 goto leave;
9829 tmpend = tmp + newlength;
9830 outdata = PyUnicode_DATA(res);
9831 outkind = PyUnicode_KIND(res);
9832 switch (outkind) {
9833 case PyUnicode_1BYTE_KIND:
9834 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9835 break;
9836 case PyUnicode_2BYTE_KIND:
9837 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9838 break;
9839 case PyUnicode_4BYTE_KIND:
9840 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9841 break;
9842 default:
9843 assert(0);
9844 break;
9845 }
9846 leave:
9847 PyMem_FREE(tmp);
9848 return res;
9849}
9850
Tim Peters8ce9f162004-08-27 01:49:32 +00009851PyObject *
9852PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009853{
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009854 PyObject *res;
9855 PyObject *fseq;
9856 Py_ssize_t seqlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009857 PyObject **items;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009858
Benjamin Peterson9743b2c2014-02-15 13:02:52 -05009859 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +00009860 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009861 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009862 }
9863
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009864 /* NOTE: the following code can't call back into Python code,
9865 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009866 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009867
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009868 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00009869 seqlen = PySequence_Fast_GET_SIZE(fseq);
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009870 res = _PyUnicode_JoinArray(separator, items, seqlen);
9871 Py_DECREF(fseq);
9872 return res;
9873}
9874
9875PyObject *
9876_PyUnicode_JoinArray(PyObject *separator, PyObject **items, Py_ssize_t seqlen)
9877{
9878 PyObject *res = NULL; /* the result */
9879 PyObject *sep = NULL;
9880 Py_ssize_t seplen;
9881 PyObject *item;
9882 Py_ssize_t sz, i, res_offset;
9883 Py_UCS4 maxchar;
9884 Py_UCS4 item_maxchar;
9885 int use_memcpy;
9886 unsigned char *res_data = NULL, *sep_data = NULL;
9887 PyObject *last_obj;
9888 unsigned int kind = 0;
9889
Tim Peters05eba1f2004-08-27 21:32:02 +00009890 /* If empty sequence, return u"". */
9891 if (seqlen == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02009892 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +00009893 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009894
Tim Peters05eba1f2004-08-27 21:32:02 +00009895 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009896 last_obj = NULL;
Victor Stinneracf47b82011-10-06 12:32:37 +02009897 if (seqlen == 1) {
9898 if (PyUnicode_CheckExact(items[0])) {
9899 res = items[0];
9900 Py_INCREF(res);
Victor Stinneracf47b82011-10-06 12:32:37 +02009901 return res;
9902 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009903 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009904 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009905 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009906 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009907 /* Set up sep and seplen */
9908 if (separator == NULL) {
9909 /* fall back to a blank space separator */
9910 sep = PyUnicode_FromOrdinal(' ');
9911 if (!sep)
9912 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009913 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009914 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009915 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009916 else {
9917 if (!PyUnicode_Check(separator)) {
9918 PyErr_Format(PyExc_TypeError,
9919 "separator: expected str instance,"
9920 " %.80s found",
9921 Py_TYPE(separator)->tp_name);
9922 goto onError;
9923 }
9924 if (PyUnicode_READY(separator))
9925 goto onError;
9926 sep = separator;
9927 seplen = PyUnicode_GET_LENGTH(separator);
9928 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9929 /* inc refcount to keep this code path symmetric with the
9930 above case of a blank separator */
9931 Py_INCREF(sep);
9932 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009933 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009934 }
9935
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009936 /* There are at least two things to join, or else we have a subclass
9937 * of str in the sequence.
9938 * Do a pre-pass to figure out the total amount of space we'll
9939 * need (sz), and see whether all argument are strings.
9940 */
9941 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009942#ifdef Py_DEBUG
9943 use_memcpy = 0;
9944#else
9945 use_memcpy = 1;
9946#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009947 for (i = 0; i < seqlen; i++) {
9948 const Py_ssize_t old_sz = sz;
9949 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009950 if (!PyUnicode_Check(item)) {
9951 PyErr_Format(PyExc_TypeError,
Victor Stinnera33bce02014-07-04 22:47:46 +02009952 "sequence item %zd: expected str instance,"
Benjamin Peterson29060642009-01-31 22:14:21 +00009953 " %.80s found",
9954 i, Py_TYPE(item)->tp_name);
9955 goto onError;
9956 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009957 if (PyUnicode_READY(item) == -1)
9958 goto onError;
9959 sz += PyUnicode_GET_LENGTH(item);
9960 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -07009961 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009962 if (i != 0)
9963 sz += seplen;
9964 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9965 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009966 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009967 goto onError;
9968 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009969 if (use_memcpy && last_obj != NULL) {
9970 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9971 use_memcpy = 0;
9972 }
9973 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009974 }
Tim Petersced69f82003-09-16 20:30:58 +00009975
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009976 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009977 if (res == NULL)
9978 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009979
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009980 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009981#ifdef Py_DEBUG
9982 use_memcpy = 0;
9983#else
9984 if (use_memcpy) {
9985 res_data = PyUnicode_1BYTE_DATA(res);
9986 kind = PyUnicode_KIND(res);
9987 if (seplen != 0)
9988 sep_data = PyUnicode_1BYTE_DATA(sep);
9989 }
9990#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +02009991 if (use_memcpy) {
9992 for (i = 0; i < seqlen; ++i) {
9993 Py_ssize_t itemlen;
9994 item = items[i];
9995
9996 /* Copy item, and maybe the separator. */
9997 if (i && seplen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +02009998 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +02009999 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010000 kind * seplen);
10001 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010002 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010003
10004 itemlen = PyUnicode_GET_LENGTH(item);
10005 if (itemlen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010006 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010007 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010008 kind * itemlen);
10009 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010010 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010011 }
10012 assert(res_data == PyUnicode_1BYTE_DATA(res)
10013 + kind * PyUnicode_GET_LENGTH(res));
10014 }
10015 else {
10016 for (i = 0, res_offset = 0; i < seqlen; ++i) {
10017 Py_ssize_t itemlen;
10018 item = items[i];
10019
10020 /* Copy item, and maybe the separator. */
10021 if (i && seplen != 0) {
10022 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10023 res_offset += seplen;
10024 }
10025
10026 itemlen = PyUnicode_GET_LENGTH(item);
10027 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020010028 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +020010029 res_offset += itemlen;
10030 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010031 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010032 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +020010033 }
Tim Peters8ce9f162004-08-27 01:49:32 +000010034
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010035 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010036 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010037 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010038
Benjamin Peterson29060642009-01-31 22:14:21 +000010039 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010040 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010041 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010042 return NULL;
10043}
10044
/* Store `value` into `length` consecutive code units of `data`, starting
   at code-unit index `start`.  `kind` selects the code-unit width
   (1, 2 or 4 byte kind); the wchar kind is not supported.

   This is a macro: `kind`, `value` and `length` may be evaluated more
   than once, so the arguments must be free of side effects.  Every
   argument is parenthesized in the expansion so that expression
   arguments bind as the caller intended. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
10068
Victor Stinnerd3f08822012-05-29 12:57:52 +020010069void
10070_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10071 Py_UCS4 fill_char)
10072{
10073 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
10074 const void *data = PyUnicode_DATA(unicode);
10075 assert(PyUnicode_IS_READY(unicode));
10076 assert(unicode_modifiable(unicode));
10077 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10078 assert(start >= 0);
10079 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
10080 FILL(kind, data, fill_char, start, length);
10081}
10082
Victor Stinner3fe55312012-01-04 00:33:50 +010010083Py_ssize_t
10084PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10085 Py_UCS4 fill_char)
10086{
10087 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +010010088
10089 if (!PyUnicode_Check(unicode)) {
10090 PyErr_BadInternalCall();
10091 return -1;
10092 }
10093 if (PyUnicode_READY(unicode) == -1)
10094 return -1;
10095 if (unicode_check_modifiable(unicode))
10096 return -1;
10097
Victor Stinnerd3f08822012-05-29 12:57:52 +020010098 if (start < 0) {
10099 PyErr_SetString(PyExc_IndexError, "string index out of range");
10100 return -1;
10101 }
Victor Stinner3fe55312012-01-04 00:33:50 +010010102 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10103 PyErr_SetString(PyExc_ValueError,
10104 "fill character is bigger than "
10105 "the string maximum character");
10106 return -1;
10107 }
10108
10109 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10110 length = Py_MIN(maxlen, length);
10111 if (length <= 0)
10112 return 0;
10113
Victor Stinnerd3f08822012-05-29 12:57:52 +020010114 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +010010115 return length;
10116}
10117
Victor Stinner9310abb2011-10-05 00:59:23 +020010118static PyObject *
10119pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010120 Py_ssize_t left,
10121 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010122 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010123{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010124 PyObject *u;
10125 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010126 int kind;
10127 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010128
10129 if (left < 0)
10130 left = 0;
10131 if (right < 0)
10132 right = 0;
10133
Victor Stinnerc4b49542011-12-11 22:44:26 +010010134 if (left == 0 && right == 0)
10135 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010136
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010137 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10138 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010139 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10140 return NULL;
10141 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010142 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010143 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010144 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010145 if (!u)
10146 return NULL;
10147
10148 kind = PyUnicode_KIND(u);
10149 data = PyUnicode_DATA(u);
10150 if (left)
10151 FILL(kind, data, fill, 0, left);
10152 if (right)
10153 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010154 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010155 assert(_PyUnicode_CheckConsistency(u, 1));
10156 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010157}
10158
Alexander Belopolsky40018472011-02-26 01:02:56 +000010159PyObject *
10160PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010161{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010162 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010163
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010164 if (ensure_unicode(string) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010165 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010166
Benjamin Petersonead6b532011-12-20 17:23:42 -060010167 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010168 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010169 if (PyUnicode_IS_ASCII(string))
10170 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010171 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010172 PyUnicode_GET_LENGTH(string), keepends);
10173 else
10174 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010175 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010176 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010177 break;
10178 case PyUnicode_2BYTE_KIND:
10179 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010180 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010181 PyUnicode_GET_LENGTH(string), keepends);
10182 break;
10183 case PyUnicode_4BYTE_KIND:
10184 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010185 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010186 PyUnicode_GET_LENGTH(string), keepends);
10187 break;
10188 default:
10189 assert(0);
10190 list = 0;
10191 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010192 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010193}
10194
Alexander Belopolsky40018472011-02-26 01:02:56 +000010195static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010196split(PyObject *self,
10197 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010198 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010199{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010200 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010201 void *buf1, *buf2;
10202 Py_ssize_t len1, len2;
10203 PyObject* out;
10204
Guido van Rossumd57fd912000-03-10 22:53:23 +000010205 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010206 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010207
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010208 if (PyUnicode_READY(self) == -1)
10209 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010210
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010211 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010212 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010213 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010214 if (PyUnicode_IS_ASCII(self))
10215 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010216 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010217 PyUnicode_GET_LENGTH(self), maxcount
10218 );
10219 else
10220 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010221 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010222 PyUnicode_GET_LENGTH(self), maxcount
10223 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010224 case PyUnicode_2BYTE_KIND:
10225 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010226 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010227 PyUnicode_GET_LENGTH(self), maxcount
10228 );
10229 case PyUnicode_4BYTE_KIND:
10230 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010231 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010232 PyUnicode_GET_LENGTH(self), maxcount
10233 );
10234 default:
10235 assert(0);
10236 return NULL;
10237 }
10238
10239 if (PyUnicode_READY(substring) == -1)
10240 return NULL;
10241
10242 kind1 = PyUnicode_KIND(self);
10243 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010244 len1 = PyUnicode_GET_LENGTH(self);
10245 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010246 if (kind1 < kind2 || len1 < len2) {
10247 out = PyList_New(1);
10248 if (out == NULL)
10249 return NULL;
10250 Py_INCREF(self);
10251 PyList_SET_ITEM(out, 0, self);
10252 return out;
10253 }
10254 buf1 = PyUnicode_DATA(self);
10255 buf2 = PyUnicode_DATA(substring);
10256 if (kind2 != kind1) {
10257 buf2 = _PyUnicode_AsKind(substring, kind1);
10258 if (!buf2)
10259 return NULL;
10260 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010261
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010262 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010263 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010264 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10265 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010266 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010267 else
10268 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010269 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010270 break;
10271 case PyUnicode_2BYTE_KIND:
10272 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010273 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010274 break;
10275 case PyUnicode_4BYTE_KIND:
10276 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010277 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010278 break;
10279 default:
10280 out = NULL;
10281 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010282 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010283 PyMem_Free(buf2);
10284 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010285}
10286
Alexander Belopolsky40018472011-02-26 01:02:56 +000010287static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010288rsplit(PyObject *self,
10289 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010290 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010291{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010292 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010293 void *buf1, *buf2;
10294 Py_ssize_t len1, len2;
10295 PyObject* out;
10296
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010297 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010298 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010299
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010300 if (PyUnicode_READY(self) == -1)
10301 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010302
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010303 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010304 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010305 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010306 if (PyUnicode_IS_ASCII(self))
10307 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010308 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010309 PyUnicode_GET_LENGTH(self), maxcount
10310 );
10311 else
10312 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010313 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010314 PyUnicode_GET_LENGTH(self), maxcount
10315 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010316 case PyUnicode_2BYTE_KIND:
10317 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010318 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010319 PyUnicode_GET_LENGTH(self), maxcount
10320 );
10321 case PyUnicode_4BYTE_KIND:
10322 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010323 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010324 PyUnicode_GET_LENGTH(self), maxcount
10325 );
10326 default:
10327 assert(0);
10328 return NULL;
10329 }
10330
10331 if (PyUnicode_READY(substring) == -1)
10332 return NULL;
10333
10334 kind1 = PyUnicode_KIND(self);
10335 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010336 len1 = PyUnicode_GET_LENGTH(self);
10337 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010338 if (kind1 < kind2 || len1 < len2) {
10339 out = PyList_New(1);
10340 if (out == NULL)
10341 return NULL;
10342 Py_INCREF(self);
10343 PyList_SET_ITEM(out, 0, self);
10344 return out;
10345 }
10346 buf1 = PyUnicode_DATA(self);
10347 buf2 = PyUnicode_DATA(substring);
10348 if (kind2 != kind1) {
10349 buf2 = _PyUnicode_AsKind(substring, kind1);
10350 if (!buf2)
10351 return NULL;
10352 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010353
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010354 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010355 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010356 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10357 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010358 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010359 else
10360 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010361 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010362 break;
10363 case PyUnicode_2BYTE_KIND:
10364 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010365 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010366 break;
10367 case PyUnicode_4BYTE_KIND:
10368 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010369 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010370 break;
10371 default:
10372 out = NULL;
10373 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010374 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010375 PyMem_Free(buf2);
10376 return out;
10377}
10378
10379static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010380anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10381 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010382{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010383 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010384 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010385 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10386 return asciilib_find(buf1, len1, buf2, len2, offset);
10387 else
10388 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010389 case PyUnicode_2BYTE_KIND:
10390 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10391 case PyUnicode_4BYTE_KIND:
10392 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10393 }
10394 assert(0);
10395 return -1;
10396}
10397
10398static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010399anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10400 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010401{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010402 switch (kind) {
10403 case PyUnicode_1BYTE_KIND:
10404 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10405 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10406 else
10407 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10408 case PyUnicode_2BYTE_KIND:
10409 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10410 case PyUnicode_4BYTE_KIND:
10411 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10412 }
10413 assert(0);
10414 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010415}
10416
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010417static void
10418replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10419 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10420{
10421 int kind = PyUnicode_KIND(u);
10422 void *data = PyUnicode_DATA(u);
10423 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10424 if (kind == PyUnicode_1BYTE_KIND) {
10425 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10426 (Py_UCS1 *)data + len,
10427 u1, u2, maxcount);
10428 }
10429 else if (kind == PyUnicode_2BYTE_KIND) {
10430 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10431 (Py_UCS2 *)data + len,
10432 u1, u2, maxcount);
10433 }
10434 else {
10435 assert(kind == PyUnicode_4BYTE_KIND);
10436 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10437 (Py_UCS4 *)data + len,
10438 u1, u2, maxcount);
10439 }
10440}
10441
Alexander Belopolsky40018472011-02-26 01:02:56 +000010442static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010443replace(PyObject *self, PyObject *str1,
10444 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010445{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010446 PyObject *u;
10447 char *sbuf = PyUnicode_DATA(self);
10448 char *buf1 = PyUnicode_DATA(str1);
10449 char *buf2 = PyUnicode_DATA(str2);
10450 int srelease = 0, release1 = 0, release2 = 0;
10451 int skind = PyUnicode_KIND(self);
10452 int kind1 = PyUnicode_KIND(str1);
10453 int kind2 = PyUnicode_KIND(str2);
10454 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10455 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10456 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010457 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010458 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010459
10460 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010461 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010462 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010463 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010464
Victor Stinner59de0ee2011-10-07 10:01:28 +020010465 if (str1 == str2)
10466 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010467
Victor Stinner49a0a212011-10-12 23:46:10 +020010468 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010469 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10470 if (maxchar < maxchar_str1)
10471 /* substring too wide to be present */
10472 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010473 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10474 /* Replacing str1 with str2 may cause a maxchar reduction in the
10475 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010476 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010477 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010478
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010479 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010480 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010481 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010482 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010483 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010484 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010485 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010486 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010487
Victor Stinner69ed0f42013-04-09 21:48:24 +020010488 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010489 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010490 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010491 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010492 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010493 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010494 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010495 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010496
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010497 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10498 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010499 }
10500 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010501 int rkind = skind;
10502 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010503 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010504
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010505 if (kind1 < rkind) {
10506 /* widen substring */
10507 buf1 = _PyUnicode_AsKind(str1, rkind);
10508 if (!buf1) goto error;
10509 release1 = 1;
10510 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010511 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010512 if (i < 0)
10513 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010514 if (rkind > kind2) {
10515 /* widen replacement */
10516 buf2 = _PyUnicode_AsKind(str2, rkind);
10517 if (!buf2) goto error;
10518 release2 = 1;
10519 }
10520 else if (rkind < kind2) {
10521 /* widen self and buf1 */
10522 rkind = kind2;
10523 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010524 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010525 sbuf = _PyUnicode_AsKind(self, rkind);
10526 if (!sbuf) goto error;
10527 srelease = 1;
10528 buf1 = _PyUnicode_AsKind(str1, rkind);
10529 if (!buf1) goto error;
10530 release1 = 1;
10531 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010532 u = PyUnicode_New(slen, maxchar);
10533 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010534 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010535 assert(PyUnicode_KIND(u) == rkind);
10536 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010537
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010538 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010539 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010540 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010541 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010542 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010543 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010544
10545 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010546 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010547 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010548 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010549 if (i == -1)
10550 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010551 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010552 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010553 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010554 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010555 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010556 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010557 }
10558 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010559 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010560 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010561 int rkind = skind;
10562 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010563
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010564 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010565 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010566 buf1 = _PyUnicode_AsKind(str1, rkind);
10567 if (!buf1) goto error;
10568 release1 = 1;
10569 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010570 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010571 if (n == 0)
10572 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010573 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010574 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010575 buf2 = _PyUnicode_AsKind(str2, rkind);
10576 if (!buf2) goto error;
10577 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010578 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010579 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010580 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010581 rkind = kind2;
10582 sbuf = _PyUnicode_AsKind(self, rkind);
10583 if (!sbuf) goto error;
10584 srelease = 1;
10585 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010586 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010587 buf1 = _PyUnicode_AsKind(str1, rkind);
10588 if (!buf1) goto error;
10589 release1 = 1;
10590 }
10591 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10592 PyUnicode_GET_LENGTH(str1))); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010593 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010594 PyErr_SetString(PyExc_OverflowError,
10595 "replace string is too long");
10596 goto error;
10597 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010598 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010599 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010600 _Py_INCREF_UNICODE_EMPTY();
10601 if (!unicode_empty)
10602 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010603 u = unicode_empty;
10604 goto done;
10605 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010606 if (new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010607 PyErr_SetString(PyExc_OverflowError,
10608 "replace string is too long");
10609 goto error;
10610 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010611 u = PyUnicode_New(new_size, maxchar);
10612 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010613 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010614 assert(PyUnicode_KIND(u) == rkind);
10615 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010616 ires = i = 0;
10617 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010618 while (n-- > 0) {
10619 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010620 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010621 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010622 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010623 if (j == -1)
10624 break;
10625 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010626 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010627 memcpy(res + rkind * ires,
10628 sbuf + rkind * i,
10629 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010630 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010631 }
10632 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010633 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010634 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010635 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010636 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010637 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010638 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010639 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010640 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010641 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010642 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010643 memcpy(res + rkind * ires,
10644 sbuf + rkind * i,
10645 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010646 }
10647 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010648 /* interleave */
10649 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010650 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010651 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010652 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010653 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010654 if (--n <= 0)
10655 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010656 memcpy(res + rkind * ires,
10657 sbuf + rkind * i,
10658 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010659 ires++;
10660 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010661 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010662 memcpy(res + rkind * ires,
10663 sbuf + rkind * i,
10664 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010665 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010666 }
10667
10668 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010669 unicode_adjust_maxchar(&u);
10670 if (u == NULL)
10671 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010672 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010673
10674 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010675 if (srelease)
10676 PyMem_FREE(sbuf);
10677 if (release1)
10678 PyMem_FREE(buf1);
10679 if (release2)
10680 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010681 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010682 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010683
Benjamin Peterson29060642009-01-31 22:14:21 +000010684 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010685 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010686 if (srelease)
10687 PyMem_FREE(sbuf);
10688 if (release1)
10689 PyMem_FREE(buf1);
10690 if (release2)
10691 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010692 return unicode_result_unchanged(self);
10693
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010694 error:
10695 if (srelease && sbuf)
10696 PyMem_FREE(sbuf);
10697 if (release1 && buf1)
10698 PyMem_FREE(buf1);
10699 if (release2 && buf2)
10700 PyMem_FREE(buf2);
10701 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010702}
10703
10704/* --- Unicode Object Methods --------------------------------------------- */
10705
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010706PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010707 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010708\n\
10709Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010710characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010711
10712static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010713unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010714{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010715 if (PyUnicode_READY(self) == -1)
10716 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010717 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010718}
10719
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010720PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010721 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010722\n\
10723Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010724have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010725
10726static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010727unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010728{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010729 if (PyUnicode_READY(self) == -1)
10730 return NULL;
10731 if (PyUnicode_GET_LENGTH(self) == 0)
10732 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010733 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010734}
10735
Benjamin Petersond5890c82012-01-14 13:23:30 -050010736PyDoc_STRVAR(casefold__doc__,
10737 "S.casefold() -> str\n\
10738\n\
10739Return a version of S suitable for caseless comparisons.");
10740
10741static PyObject *
10742unicode_casefold(PyObject *self)
10743{
10744 if (PyUnicode_READY(self) == -1)
10745 return NULL;
10746 if (PyUnicode_IS_ASCII(self))
10747 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010748 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010749}
10750
10751
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010752/* Argument converter. Accepts a single Unicode character. */
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010753
10754static int
10755convert_uc(PyObject *obj, void *addr)
10756{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010757 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010758
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010759 if (!PyUnicode_Check(obj)) {
10760 PyErr_Format(PyExc_TypeError,
10761 "The fill character must be a unicode character, "
10762 "not %.100s", Py_TYPE(obj)->tp_name);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010763 return 0;
10764 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010765 if (PyUnicode_READY(obj) < 0)
10766 return 0;
10767 if (PyUnicode_GET_LENGTH(obj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010768 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010769 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010770 return 0;
10771 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010772 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010773 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010774}
10775
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010776PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010777 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010778\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010779Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010780done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010781
10782static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010783unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010784{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010785 Py_ssize_t marg, left;
10786 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010787 Py_UCS4 fillchar = ' ';
10788
Victor Stinnere9a29352011-10-01 02:14:59 +020010789 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010790 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010791
Benjamin Petersonbac79492012-01-14 13:34:47 -050010792 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010793 return NULL;
10794
Victor Stinnerc4b49542011-12-11 22:44:26 +010010795 if (PyUnicode_GET_LENGTH(self) >= width)
10796 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010797
Victor Stinnerc4b49542011-12-11 22:44:26 +010010798 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010799 left = marg / 2 + (marg & width & 1);
10800
Victor Stinner9310abb2011-10-05 00:59:23 +020010801 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010802}
10803
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010804/* This function assumes that str1 and str2 are readied by the caller. */
10805
Marc-André Lemburge5034372000-08-08 08:04:29 +000010806static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010807unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010808{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010809#define COMPARE(TYPE1, TYPE2) \
10810 do { \
10811 TYPE1* p1 = (TYPE1 *)data1; \
10812 TYPE2* p2 = (TYPE2 *)data2; \
10813 TYPE1* end = p1 + len; \
10814 Py_UCS4 c1, c2; \
10815 for (; p1 != end; p1++, p2++) { \
10816 c1 = *p1; \
10817 c2 = *p2; \
10818 if (c1 != c2) \
10819 return (c1 < c2) ? -1 : 1; \
10820 } \
10821 } \
10822 while (0)
10823
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010824 int kind1, kind2;
10825 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010826 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010827
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010828 kind1 = PyUnicode_KIND(str1);
10829 kind2 = PyUnicode_KIND(str2);
10830 data1 = PyUnicode_DATA(str1);
10831 data2 = PyUnicode_DATA(str2);
10832 len1 = PyUnicode_GET_LENGTH(str1);
10833 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010834 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010835
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010836 switch(kind1) {
10837 case PyUnicode_1BYTE_KIND:
10838 {
10839 switch(kind2) {
10840 case PyUnicode_1BYTE_KIND:
10841 {
10842 int cmp = memcmp(data1, data2, len);
10843 /* normalize result of memcmp() into the range [-1; 1] */
10844 if (cmp < 0)
10845 return -1;
10846 if (cmp > 0)
10847 return 1;
10848 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020010849 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010850 case PyUnicode_2BYTE_KIND:
10851 COMPARE(Py_UCS1, Py_UCS2);
10852 break;
10853 case PyUnicode_4BYTE_KIND:
10854 COMPARE(Py_UCS1, Py_UCS4);
10855 break;
10856 default:
10857 assert(0);
10858 }
10859 break;
10860 }
10861 case PyUnicode_2BYTE_KIND:
10862 {
10863 switch(kind2) {
10864 case PyUnicode_1BYTE_KIND:
10865 COMPARE(Py_UCS2, Py_UCS1);
10866 break;
10867 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010868 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010869 COMPARE(Py_UCS2, Py_UCS2);
10870 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010871 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010872 case PyUnicode_4BYTE_KIND:
10873 COMPARE(Py_UCS2, Py_UCS4);
10874 break;
10875 default:
10876 assert(0);
10877 }
10878 break;
10879 }
10880 case PyUnicode_4BYTE_KIND:
10881 {
10882 switch(kind2) {
10883 case PyUnicode_1BYTE_KIND:
10884 COMPARE(Py_UCS4, Py_UCS1);
10885 break;
10886 case PyUnicode_2BYTE_KIND:
10887 COMPARE(Py_UCS4, Py_UCS2);
10888 break;
10889 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010890 {
10891#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10892 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10893 /* normalize result of wmemcmp() into the range [-1; 1] */
10894 if (cmp < 0)
10895 return -1;
10896 if (cmp > 0)
10897 return 1;
10898#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010899 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020010900#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010901 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010902 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010903 default:
10904 assert(0);
10905 }
10906 break;
10907 }
10908 default:
10909 assert(0);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010910 }
10911
Victor Stinner770e19e2012-10-04 22:59:45 +020010912 if (len1 == len2)
10913 return 0;
10914 if (len1 < len2)
10915 return -1;
10916 else
10917 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010918
10919#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000010920}
10921
Benjamin Peterson621b4302016-09-09 13:54:34 -070010922static int
Victor Stinnere5567ad2012-10-23 02:48:49 +020010923unicode_compare_eq(PyObject *str1, PyObject *str2)
10924{
10925 int kind;
10926 void *data1, *data2;
10927 Py_ssize_t len;
10928 int cmp;
10929
Victor Stinnere5567ad2012-10-23 02:48:49 +020010930 len = PyUnicode_GET_LENGTH(str1);
10931 if (PyUnicode_GET_LENGTH(str2) != len)
10932 return 0;
10933 kind = PyUnicode_KIND(str1);
10934 if (PyUnicode_KIND(str2) != kind)
10935 return 0;
10936 data1 = PyUnicode_DATA(str1);
10937 data2 = PyUnicode_DATA(str2);
10938
10939 cmp = memcmp(data1, data2, len * kind);
10940 return (cmp == 0);
10941}
10942
10943
Alexander Belopolsky40018472011-02-26 01:02:56 +000010944int
10945PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010946{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010947 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10948 if (PyUnicode_READY(left) == -1 ||
10949 PyUnicode_READY(right) == -1)
10950 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010010951
10952 /* a string is equal to itself */
10953 if (left == right)
10954 return 0;
10955
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010956 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010957 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010958 PyErr_Format(PyExc_TypeError,
10959 "Can't compare %.100s and %.100s",
10960 left->ob_type->tp_name,
10961 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010962 return -1;
10963}
10964
Martin v. Löwis5b222132007-06-10 09:51:05 +000010965int
Victor Stinnerad14ccd2013-11-07 00:46:04 +010010966_PyUnicode_CompareWithId(PyObject *left, _Py_Identifier *right)
10967{
10968 PyObject *right_str = _PyUnicode_FromId(right); /* borrowed */
10969 if (right_str == NULL)
10970 return -1;
10971 return PyUnicode_Compare(left, right_str);
10972}
10973
10974int
Martin v. Löwis5b222132007-06-10 09:51:05 +000010975PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10976{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010977 Py_ssize_t i;
10978 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010979 Py_UCS4 chr;
10980
Victor Stinner910337b2011-10-03 03:20:16 +020010981 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010982 if (PyUnicode_READY(uni) == -1)
10983 return -1;
10984 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010010985 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010010986 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010010987 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010010988 size_t len, len2 = strlen(str);
10989 int cmp;
10990
10991 len = Py_MIN(len1, len2);
10992 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010010993 if (cmp != 0) {
10994 if (cmp < 0)
10995 return -1;
10996 else
10997 return 1;
10998 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010010999 if (len1 > len2)
11000 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011001 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010011002 return -1; /* str is longer */
11003 return 0;
11004 }
11005 else {
11006 void *data = PyUnicode_DATA(uni);
11007 /* Compare Unicode string and source character set string */
11008 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020011009 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010011010 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11011 /* This check keeps Python strings that end in '\0' from comparing equal
11012 to C strings identical up to that point. */
11013 if (PyUnicode_GET_LENGTH(uni) != i || chr)
11014 return 1; /* uni is longer */
11015 if (str[i])
11016 return -1; /* str is longer */
11017 return 0;
11018 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000011019}
11020
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011021
Benjamin Peterson29060642009-01-31 22:14:21 +000011022#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000011023 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011024
Alexander Belopolsky40018472011-02-26 01:02:56 +000011025PyObject *
11026PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011027{
11028 int result;
Victor Stinnere5567ad2012-10-23 02:48:49 +020011029 PyObject *v;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011030
Victor Stinnere5567ad2012-10-23 02:48:49 +020011031 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11032 Py_RETURN_NOTIMPLEMENTED;
11033
11034 if (PyUnicode_READY(left) == -1 ||
11035 PyUnicode_READY(right) == -1)
11036 return NULL;
11037
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011038 if (left == right) {
11039 switch (op) {
11040 case Py_EQ:
11041 case Py_LE:
11042 case Py_GE:
11043 /* a string is equal to itself */
11044 v = Py_True;
11045 break;
11046 case Py_NE:
11047 case Py_LT:
11048 case Py_GT:
11049 v = Py_False;
11050 break;
11051 default:
11052 PyErr_BadArgument();
11053 return NULL;
11054 }
11055 }
11056 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020011057 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010011058 result ^= (op == Py_NE);
11059 v = TEST_COND(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020011060 }
11061 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020011062 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011063
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011064 /* Convert the return value to a Boolean */
11065 switch (op) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011066 case Py_LE:
11067 v = TEST_COND(result <= 0);
11068 break;
11069 case Py_GE:
11070 v = TEST_COND(result >= 0);
11071 break;
11072 case Py_LT:
11073 v = TEST_COND(result == -1);
11074 break;
11075 case Py_GT:
11076 v = TEST_COND(result == 1);
11077 break;
11078 default:
11079 PyErr_BadArgument();
11080 return NULL;
11081 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011082 }
Victor Stinnere5567ad2012-10-23 02:48:49 +020011083 Py_INCREF(v);
11084 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011085}
11086
Alexander Belopolsky40018472011-02-26 01:02:56 +000011087int
Raymond Hettingerac2ef652015-07-04 16:04:44 -070011088_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11089{
11090 return unicode_eq(aa, bb);
11091}
11092
11093int
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011094PyUnicode_Contains(PyObject *str, PyObject *substr)
Guido van Rossum403d68b2000-03-13 15:55:09 +000011095{
Victor Stinner77282cb2013-04-14 19:22:47 +020011096 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011097 void *buf1, *buf2;
11098 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011099 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011100
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011101 if (!PyUnicode_Check(substr)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011102 PyErr_Format(PyExc_TypeError,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011103 "'in <string>' requires string as left operand, not %.100s",
11104 Py_TYPE(substr)->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011105 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011106 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011107 if (PyUnicode_READY(substr) == -1)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011108 return -1;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011109 if (ensure_unicode(str) < 0)
11110 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011111
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011112 kind1 = PyUnicode_KIND(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011113 kind2 = PyUnicode_KIND(substr);
11114 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011115 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011116 len1 = PyUnicode_GET_LENGTH(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011117 len2 = PyUnicode_GET_LENGTH(substr);
11118 if (len1 < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011119 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011120 buf1 = PyUnicode_DATA(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011121 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011122 if (len2 == 1) {
11123 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11124 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011125 return result;
11126 }
11127 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011128 buf2 = _PyUnicode_AsKind(substr, kind1);
11129 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011130 return -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011131 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011132
Victor Stinner77282cb2013-04-14 19:22:47 +020011133 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011134 case PyUnicode_1BYTE_KIND:
11135 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11136 break;
11137 case PyUnicode_2BYTE_KIND:
11138 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11139 break;
11140 case PyUnicode_4BYTE_KIND:
11141 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11142 break;
11143 default:
11144 result = -1;
11145 assert(0);
11146 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011147
Victor Stinner77282cb2013-04-14 19:22:47 +020011148 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011149 PyMem_Free(buf2);
11150
Guido van Rossum403d68b2000-03-13 15:55:09 +000011151 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011152}
11153
Guido van Rossumd57fd912000-03-10 22:53:23 +000011154/* Concat to string or Unicode object giving a new Unicode object. */
11155
Alexander Belopolsky40018472011-02-26 01:02:56 +000011156PyObject *
11157PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011158{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011159 PyObject *result;
Victor Stinner127226b2011-10-13 01:12:34 +020011160 Py_UCS4 maxchar, maxchar2;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011161 Py_ssize_t left_len, right_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011162
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011163 if (ensure_unicode(left) < 0 || ensure_unicode(right) < 0)
11164 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011165
11166 /* Shortcuts */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011167 if (left == unicode_empty)
11168 return PyUnicode_FromObject(right);
11169 if (right == unicode_empty)
11170 return PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011171
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011172 left_len = PyUnicode_GET_LENGTH(left);
11173 right_len = PyUnicode_GET_LENGTH(right);
11174 if (left_len > PY_SSIZE_T_MAX - right_len) {
Victor Stinner488fa492011-12-12 00:01:39 +010011175 PyErr_SetString(PyExc_OverflowError,
11176 "strings are too large to concat");
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011177 return NULL;
Victor Stinner488fa492011-12-12 00:01:39 +010011178 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011179 new_len = left_len + right_len;
Victor Stinner488fa492011-12-12 00:01:39 +010011180
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011181 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11182 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011183 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011184
Guido van Rossumd57fd912000-03-10 22:53:23 +000011185 /* Concat the two Unicode strings */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011186 result = PyUnicode_New(new_len, maxchar);
11187 if (result == NULL)
11188 return NULL;
11189 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11190 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11191 assert(_PyUnicode_CheckConsistency(result, 1));
11192 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011193}
11194
Walter Dörwald1ab83302007-05-18 17:15:44 +000011195void
Victor Stinner23e56682011-10-03 03:54:37 +020011196PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011197{
Victor Stinner23e56682011-10-03 03:54:37 +020011198 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011199 Py_UCS4 maxchar, maxchar2;
11200 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011201
11202 if (p_left == NULL) {
11203 if (!PyErr_Occurred())
11204 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011205 return;
11206 }
Victor Stinner23e56682011-10-03 03:54:37 +020011207 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011208 if (right == NULL || left == NULL
11209 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011210 if (!PyErr_Occurred())
11211 PyErr_BadInternalCall();
11212 goto error;
11213 }
11214
Benjamin Petersonbac79492012-01-14 13:34:47 -050011215 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011216 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011217 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011218 goto error;
11219
Victor Stinner488fa492011-12-12 00:01:39 +010011220 /* Shortcuts */
11221 if (left == unicode_empty) {
11222 Py_DECREF(left);
11223 Py_INCREF(right);
11224 *p_left = right;
11225 return;
11226 }
11227 if (right == unicode_empty)
11228 return;
11229
11230 left_len = PyUnicode_GET_LENGTH(left);
11231 right_len = PyUnicode_GET_LENGTH(right);
11232 if (left_len > PY_SSIZE_T_MAX - right_len) {
11233 PyErr_SetString(PyExc_OverflowError,
11234 "strings are too large to concat");
11235 goto error;
11236 }
11237 new_len = left_len + right_len;
11238
11239 if (unicode_modifiable(left)
11240 && PyUnicode_CheckExact(right)
11241 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011242 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11243 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011244 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011245 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011246 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11247 {
11248 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011249 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011250 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011251
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011252 /* copy 'right' into the newly allocated area of 'left' */
11253 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011254 }
Victor Stinner488fa492011-12-12 00:01:39 +010011255 else {
11256 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11257 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011258 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011259
Victor Stinner488fa492011-12-12 00:01:39 +010011260 /* Concat the two Unicode strings */
11261 res = PyUnicode_New(new_len, maxchar);
11262 if (res == NULL)
11263 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011264 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11265 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011266 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011267 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011268 }
11269 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011270 return;
11271
11272error:
Victor Stinner488fa492011-12-12 00:01:39 +010011273 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011274}
11275
11276void
11277PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11278{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011279 PyUnicode_Append(pleft, right);
11280 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011281}
11282
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011283/*
11284Wraps stringlib_parse_args_finds() and additionally ensures that the
11285first argument is a unicode object.
11286*/
11287
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070011288static inline int
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011289parse_args_finds_unicode(const char * function_name, PyObject *args,
11290 PyObject **substring,
11291 Py_ssize_t *start, Py_ssize_t *end)
11292{
11293 if(stringlib_parse_args_finds(function_name, args, substring,
11294 start, end)) {
11295 if (ensure_unicode(*substring) < 0)
11296 return 0;
11297 return 1;
11298 }
11299 return 0;
11300}
11301
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011302PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011303 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011304\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011305Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011306string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011307interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011308
11309static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011310unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011311{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011312 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011313 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011314 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011315 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011316 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011317 void *buf1, *buf2;
11318 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011319
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011320 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011321 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011322
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011323 kind1 = PyUnicode_KIND(self);
11324 kind2 = PyUnicode_KIND(substring);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011325 if (kind1 < kind2)
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011326 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011327
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011328 len1 = PyUnicode_GET_LENGTH(self);
11329 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011330 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011331 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011332 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011333
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011334 buf1 = PyUnicode_DATA(self);
11335 buf2 = PyUnicode_DATA(substring);
11336 if (kind2 != kind1) {
11337 buf2 = _PyUnicode_AsKind(substring, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011338 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011339 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011340 }
11341 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011342 case PyUnicode_1BYTE_KIND:
11343 iresult = ucs1lib_count(
11344 ((Py_UCS1*)buf1) + start, end - start,
11345 buf2, len2, PY_SSIZE_T_MAX
11346 );
11347 break;
11348 case PyUnicode_2BYTE_KIND:
11349 iresult = ucs2lib_count(
11350 ((Py_UCS2*)buf1) + start, end - start,
11351 buf2, len2, PY_SSIZE_T_MAX
11352 );
11353 break;
11354 case PyUnicode_4BYTE_KIND:
11355 iresult = ucs4lib_count(
11356 ((Py_UCS4*)buf1) + start, end - start,
11357 buf2, len2, PY_SSIZE_T_MAX
11358 );
11359 break;
11360 default:
11361 assert(0); iresult = 0;
11362 }
11363
11364 result = PyLong_FromSsize_t(iresult);
11365
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011366 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011367 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011368
Guido van Rossumd57fd912000-03-10 22:53:23 +000011369 return result;
11370}
11371
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011372PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000011373 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011374\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000011375Encode S using the codec registered for encoding. Default encoding\n\
11376is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000011377handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000011378a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
11379'xmlcharrefreplace' as well as any other name registered with\n\
11380codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011381
11382static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011383unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011384{
Benjamin Peterson308d6372009-09-18 21:42:35 +000011385 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011386 char *encoding = NULL;
11387 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000011388
Benjamin Peterson308d6372009-09-18 21:42:35 +000011389 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
11390 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011391 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011392 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011393}
11394
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011395PyDoc_STRVAR(expandtabs__doc__,
Ezio Melotti745d54d2013-11-16 19:10:57 +020011396 "S.expandtabs(tabsize=8) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011397\n\
11398Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011399If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011400
11401static PyObject*
Ezio Melotti745d54d2013-11-16 19:10:57 +020011402unicode_expandtabs(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011403{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011404 Py_ssize_t i, j, line_pos, src_len, incr;
11405 Py_UCS4 ch;
11406 PyObject *u;
11407 void *src_data, *dest_data;
Ezio Melotti745d54d2013-11-16 19:10:57 +020011408 static char *kwlist[] = {"tabsize", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011409 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011410 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011411 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011412
Ezio Melotti745d54d2013-11-16 19:10:57 +020011413 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:expandtabs",
11414 kwlist, &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000011415 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011416
Antoine Pitrou22425222011-10-04 19:10:51 +020011417 if (PyUnicode_READY(self) == -1)
11418 return NULL;
11419
Thomas Wouters7e474022000-07-16 12:04:32 +000011420 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011421 src_len = PyUnicode_GET_LENGTH(self);
11422 i = j = line_pos = 0;
11423 kind = PyUnicode_KIND(self);
11424 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011425 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011426 for (; i < src_len; i++) {
11427 ch = PyUnicode_READ(kind, src_data, i);
11428 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011429 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011430 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011431 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011432 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011433 goto overflow;
11434 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011435 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011436 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011437 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011438 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011439 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011440 goto overflow;
11441 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011442 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011443 if (ch == '\n' || ch == '\r')
11444 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011445 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011446 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011447 if (!found)
11448 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011449
Guido van Rossumd57fd912000-03-10 22:53:23 +000011450 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011451 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011452 if (!u)
11453 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011454 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011455
Antoine Pitroue71d5742011-10-04 15:55:09 +020011456 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011457
Antoine Pitroue71d5742011-10-04 15:55:09 +020011458 for (; i < src_len; i++) {
11459 ch = PyUnicode_READ(kind, src_data, i);
11460 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011461 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011462 incr = tabsize - (line_pos % tabsize);
11463 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010011464 FILL(kind, dest_data, ' ', j, incr);
11465 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011466 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011467 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011468 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011469 line_pos++;
11470 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011471 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011472 if (ch == '\n' || ch == '\r')
11473 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011474 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011475 }
11476 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011477 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011478
Antoine Pitroue71d5742011-10-04 15:55:09 +020011479 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011480 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11481 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011482}
11483
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011484PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011485 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011486\n\
11487Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011488such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011489arguments start and end are interpreted as in slice notation.\n\
11490\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011491Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011492
11493static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011494unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011495{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011496 /* initialize variables to prevent gcc warning */
11497 PyObject *substring = NULL;
11498 Py_ssize_t start = 0;
11499 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011500 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011501
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011502 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011503 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011504
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011505 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011506 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011507
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011508 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011509
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011510 if (result == -2)
11511 return NULL;
11512
Christian Heimes217cfd12007-12-02 14:31:20 +000011513 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011514}
11515
11516static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011517unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011518{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011519 void *data;
11520 enum PyUnicode_Kind kind;
11521 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011522
11523 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
11524 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011525 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011526 }
11527 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11528 PyErr_SetString(PyExc_IndexError, "string index out of range");
11529 return NULL;
11530 }
11531 kind = PyUnicode_KIND(self);
11532 data = PyUnicode_DATA(self);
11533 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011534 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011535}
11536
Guido van Rossumc2504932007-09-18 19:42:40 +000011537/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011538 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011539static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011540unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011541{
Guido van Rossumc2504932007-09-18 19:42:40 +000011542 Py_ssize_t len;
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011543 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011544
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011545#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011546 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011547#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011548 if (_PyUnicode_HASH(self) != -1)
11549 return _PyUnicode_HASH(self);
11550 if (PyUnicode_READY(self) == -1)
11551 return -1;
11552 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011553 /*
11554 We make the hash of the empty string be 0, rather than using
11555 (prefix ^ suffix), since this slightly obfuscates the hash secret
11556 */
11557 if (len == 0) {
11558 _PyUnicode_HASH(self) = 0;
11559 return 0;
11560 }
Christian Heimes985ecdc2013-11-20 11:46:18 +010011561 x = _Py_HashBytes(PyUnicode_DATA(self),
11562 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011563 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011564 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011565}
11566
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011567PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011568 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011569\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011570Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011571
11572static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011573unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011574{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011575 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011576 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011577 PyObject *substring = NULL;
11578 Py_ssize_t start = 0;
11579 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011580
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011581 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011582 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011583
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011584 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011585 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011586
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011587 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011588
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011589 if (result == -2)
11590 return NULL;
11591
Guido van Rossumd57fd912000-03-10 22:53:23 +000011592 if (result < 0) {
11593 PyErr_SetString(PyExc_ValueError, "substring not found");
11594 return NULL;
11595 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011596
Christian Heimes217cfd12007-12-02 14:31:20 +000011597 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011598}
11599
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011600PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011601 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011602\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011603Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011604at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011605
11606static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011607unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011608{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011609 Py_ssize_t i, length;
11610 int kind;
11611 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011612 int cased;
11613
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011614 if (PyUnicode_READY(self) == -1)
11615 return NULL;
11616 length = PyUnicode_GET_LENGTH(self);
11617 kind = PyUnicode_KIND(self);
11618 data = PyUnicode_DATA(self);
11619
Guido van Rossumd57fd912000-03-10 22:53:23 +000011620 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011621 if (length == 1)
11622 return PyBool_FromLong(
11623 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011624
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011625 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011626 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011627 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011628
Guido van Rossumd57fd912000-03-10 22:53:23 +000011629 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011630 for (i = 0; i < length; i++) {
11631 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011632
Benjamin Peterson29060642009-01-31 22:14:21 +000011633 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11634 return PyBool_FromLong(0);
11635 else if (!cased && Py_UNICODE_ISLOWER(ch))
11636 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011637 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011638 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011639}
11640
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011641PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011642 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011643\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011644Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011645at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011646
11647static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011648unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011649{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011650 Py_ssize_t i, length;
11651 int kind;
11652 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011653 int cased;
11654
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011655 if (PyUnicode_READY(self) == -1)
11656 return NULL;
11657 length = PyUnicode_GET_LENGTH(self);
11658 kind = PyUnicode_KIND(self);
11659 data = PyUnicode_DATA(self);
11660
Guido van Rossumd57fd912000-03-10 22:53:23 +000011661 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011662 if (length == 1)
11663 return PyBool_FromLong(
11664 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011665
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011666 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011667 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011668 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011669
Guido van Rossumd57fd912000-03-10 22:53:23 +000011670 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011671 for (i = 0; i < length; i++) {
11672 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011673
Benjamin Peterson29060642009-01-31 22:14:21 +000011674 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11675 return PyBool_FromLong(0);
11676 else if (!cased && Py_UNICODE_ISUPPER(ch))
11677 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011678 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011679 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011680}
11681
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011682PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011683 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011684\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011685Return True if S is a titlecased string and there is at least one\n\
11686character in S, i.e. upper- and titlecase characters may only\n\
11687follow uncased characters and lowercase characters only cased ones.\n\
11688Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011689
11690static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011691unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011692{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011693 Py_ssize_t i, length;
11694 int kind;
11695 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011696 int cased, previous_is_cased;
11697
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011698 if (PyUnicode_READY(self) == -1)
11699 return NULL;
11700 length = PyUnicode_GET_LENGTH(self);
11701 kind = PyUnicode_KIND(self);
11702 data = PyUnicode_DATA(self);
11703
Guido van Rossumd57fd912000-03-10 22:53:23 +000011704 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011705 if (length == 1) {
11706 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11707 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11708 (Py_UNICODE_ISUPPER(ch) != 0));
11709 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011710
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011711 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011712 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011713 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011714
Guido van Rossumd57fd912000-03-10 22:53:23 +000011715 cased = 0;
11716 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011717 for (i = 0; i < length; i++) {
11718 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011719
Benjamin Peterson29060642009-01-31 22:14:21 +000011720 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11721 if (previous_is_cased)
11722 return PyBool_FromLong(0);
11723 previous_is_cased = 1;
11724 cased = 1;
11725 }
11726 else if (Py_UNICODE_ISLOWER(ch)) {
11727 if (!previous_is_cased)
11728 return PyBool_FromLong(0);
11729 previous_is_cased = 1;
11730 cased = 1;
11731 }
11732 else
11733 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011734 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011735 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011736}
11737
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011738PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011739 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011740\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011741Return True if all characters in S are whitespace\n\
11742and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011743
11744static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011745unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011746{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011747 Py_ssize_t i, length;
11748 int kind;
11749 void *data;
11750
11751 if (PyUnicode_READY(self) == -1)
11752 return NULL;
11753 length = PyUnicode_GET_LENGTH(self);
11754 kind = PyUnicode_KIND(self);
11755 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011756
Guido van Rossumd57fd912000-03-10 22:53:23 +000011757 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011758 if (length == 1)
11759 return PyBool_FromLong(
11760 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011761
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011762 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011763 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011764 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011765
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011766 for (i = 0; i < length; i++) {
11767 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011768 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011769 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011770 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011771 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011772}
11773
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011774PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011775 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011776\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011777Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011778and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011779
11780static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011781unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011782{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011783 Py_ssize_t i, length;
11784 int kind;
11785 void *data;
11786
11787 if (PyUnicode_READY(self) == -1)
11788 return NULL;
11789 length = PyUnicode_GET_LENGTH(self);
11790 kind = PyUnicode_KIND(self);
11791 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011792
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011793 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011794 if (length == 1)
11795 return PyBool_FromLong(
11796 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011797
11798 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011799 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011800 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011801
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011802 for (i = 0; i < length; i++) {
11803 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011804 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011805 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011806 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011807}
11808
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011809PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011810 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011811\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011812Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011813and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011814
11815static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011816unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011817{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011818 int kind;
11819 void *data;
11820 Py_ssize_t len, i;
11821
11822 if (PyUnicode_READY(self) == -1)
11823 return NULL;
11824
11825 kind = PyUnicode_KIND(self);
11826 data = PyUnicode_DATA(self);
11827 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011828
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011829 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011830 if (len == 1) {
11831 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11832 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11833 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011834
11835 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011836 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011837 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011838
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011839 for (i = 0; i < len; i++) {
11840 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011841 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011842 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011843 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011844 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011845}
11846
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011847PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011848 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011849\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011850Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011851False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011852
11853static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011854unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011855{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011856 Py_ssize_t i, length;
11857 int kind;
11858 void *data;
11859
11860 if (PyUnicode_READY(self) == -1)
11861 return NULL;
11862 length = PyUnicode_GET_LENGTH(self);
11863 kind = PyUnicode_KIND(self);
11864 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011865
Guido van Rossumd57fd912000-03-10 22:53:23 +000011866 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011867 if (length == 1)
11868 return PyBool_FromLong(
11869 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011870
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011871 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011872 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011873 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011874
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011875 for (i = 0; i < length; i++) {
11876 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011877 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011878 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011879 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011880}
11881
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011882PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011883 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011884\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011885Return True if all characters in S are digits\n\
11886and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011887
11888static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011889unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011890{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011891 Py_ssize_t i, length;
11892 int kind;
11893 void *data;
11894
11895 if (PyUnicode_READY(self) == -1)
11896 return NULL;
11897 length = PyUnicode_GET_LENGTH(self);
11898 kind = PyUnicode_KIND(self);
11899 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011900
Guido van Rossumd57fd912000-03-10 22:53:23 +000011901 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011902 if (length == 1) {
11903 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11904 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11905 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011906
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011907 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011908 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011909 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011910
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011911 for (i = 0; i < length; i++) {
11912 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011913 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011914 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011915 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011916}
11917
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011918PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011919 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011920\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011921Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011922False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011923
11924static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011925unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011926{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011927 Py_ssize_t i, length;
11928 int kind;
11929 void *data;
11930
11931 if (PyUnicode_READY(self) == -1)
11932 return NULL;
11933 length = PyUnicode_GET_LENGTH(self);
11934 kind = PyUnicode_KIND(self);
11935 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011936
Guido van Rossumd57fd912000-03-10 22:53:23 +000011937 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011938 if (length == 1)
11939 return PyBool_FromLong(
11940 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011941
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011942 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011943 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011944 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011945
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011946 for (i = 0; i < length; i++) {
11947 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011948 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011949 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011950 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011951}
11952
Martin v. Löwis47383402007-08-15 07:32:56 +000011953int
11954PyUnicode_IsIdentifier(PyObject *self)
11955{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011956 int kind;
11957 void *data;
11958 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011959 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011960
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011961 if (PyUnicode_READY(self) == -1) {
11962 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011963 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011964 }
11965
11966 /* Special case for empty strings */
11967 if (PyUnicode_GET_LENGTH(self) == 0)
11968 return 0;
11969 kind = PyUnicode_KIND(self);
11970 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011971
11972 /* PEP 3131 says that the first character must be in
11973 XID_Start and subsequent characters in XID_Continue,
11974 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011975 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011976 letters, digits, underscore). However, given the current
11977 definition of XID_Start and XID_Continue, it is sufficient
11978 to check just for these, except that _ must be allowed
11979 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011980 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011981 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011982 return 0;
11983
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011984 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011985 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011986 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011987 return 1;
11988}
11989
11990PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011991 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011992\n\
11993Return True if S is a valid identifier according\n\
Raymond Hettinger378170d2013-03-23 08:21:12 -070011994to the language definition.\n\
11995\n\
11996Use keyword.iskeyword() to test for reserved identifiers\n\
11997such as \"def\" and \"class\".\n");
Martin v. Löwis47383402007-08-15 07:32:56 +000011998
11999static PyObject*
12000unicode_isidentifier(PyObject *self)
12001{
12002 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12003}
12004
Georg Brandl559e5d72008-06-11 18:37:52 +000012005PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012006 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000012007\n\
12008Return True if all characters in S are considered\n\
12009printable in repr() or S is empty, False otherwise.");
12010
12011static PyObject*
12012unicode_isprintable(PyObject *self)
12013{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012014 Py_ssize_t i, length;
12015 int kind;
12016 void *data;
12017
12018 if (PyUnicode_READY(self) == -1)
12019 return NULL;
12020 length = PyUnicode_GET_LENGTH(self);
12021 kind = PyUnicode_KIND(self);
12022 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000012023
12024 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012025 if (length == 1)
12026 return PyBool_FromLong(
12027 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000012028
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012029 for (i = 0; i < length; i++) {
12030 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012031 Py_RETURN_FALSE;
12032 }
12033 }
12034 Py_RETURN_TRUE;
12035}
12036
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012037PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000012038 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012039\n\
12040Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000012041iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012042
12043static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012044unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012045{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012046 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012047}
12048
Martin v. Löwis18e16552006-02-15 17:27:45 +000012049static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012050unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012051{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012052 if (PyUnicode_READY(self) == -1)
12053 return -1;
12054 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012055}
12056
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012057PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012058 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012059\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012060Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012061done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012062
12063static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012064unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012065{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012066 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012067 Py_UCS4 fillchar = ' ';
12068
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012069 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012070 return NULL;
12071
Benjamin Petersonbac79492012-01-14 13:34:47 -050012072 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012073 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012074
Victor Stinnerc4b49542011-12-11 22:44:26 +010012075 if (PyUnicode_GET_LENGTH(self) >= width)
12076 return unicode_result_unchanged(self);
12077
12078 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012079}
12080
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012081PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012082 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012083\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012084Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012085
12086static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012087unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012088{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012089 if (PyUnicode_READY(self) == -1)
12090 return NULL;
12091 if (PyUnicode_IS_ASCII(self))
12092 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012093 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012094}
12095
/* Strip modes.  The values double as indices into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument format strings, indexed by strip mode */
static const char * const stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Bare method name for a strip mode: skips the 3-character "|O:" prefix
   of the corresponding format string. */
#define STRIPNAME(i) (stripformat[i]+3)
12104
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012105/* externally visible for str.strip(unicode) */
12106PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012107_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012108{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012109 void *data;
12110 int kind;
12111 Py_ssize_t i, j, len;
12112 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012113 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012114
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012115 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12116 return NULL;
12117
12118 kind = PyUnicode_KIND(self);
12119 data = PyUnicode_DATA(self);
12120 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020012121 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012122 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12123 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020012124 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012125
Benjamin Peterson14339b62009-01-31 16:36:08 +000012126 i = 0;
12127 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012128 while (i < len) {
12129 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12130 if (!BLOOM(sepmask, ch))
12131 break;
12132 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12133 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012134 i++;
12135 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012136 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012137
Benjamin Peterson14339b62009-01-31 16:36:08 +000012138 j = len;
12139 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012140 j--;
12141 while (j >= i) {
12142 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12143 if (!BLOOM(sepmask, ch))
12144 break;
12145 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12146 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012147 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012148 }
12149
Benjamin Peterson29060642009-01-31 22:14:21 +000012150 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012151 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012152
Victor Stinner7931d9a2011-11-04 00:22:48 +010012153 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012154}
12155
12156PyObject*
12157PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12158{
12159 unsigned char *data;
12160 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012161 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012162
Victor Stinnerde636f32011-10-01 03:55:54 +020012163 if (PyUnicode_READY(self) == -1)
12164 return NULL;
12165
Victor Stinner684d5fd2012-05-03 02:32:34 +020012166 length = PyUnicode_GET_LENGTH(self);
12167 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012168
Victor Stinner684d5fd2012-05-03 02:32:34 +020012169 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012170 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012171
Victor Stinnerde636f32011-10-01 03:55:54 +020012172 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012173 PyErr_SetString(PyExc_IndexError, "string index out of range");
12174 return NULL;
12175 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012176 if (start >= length || end < start)
12177 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012178
Victor Stinner684d5fd2012-05-03 02:32:34 +020012179 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012180 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012181 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012182 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012183 }
12184 else {
12185 kind = PyUnicode_KIND(self);
12186 data = PyUnicode_1BYTE_DATA(self);
12187 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012188 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012189 length);
12190 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012191}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012192
12193static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012194do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012195{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012196 Py_ssize_t len, i, j;
12197
12198 if (PyUnicode_READY(self) == -1)
12199 return NULL;
12200
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012201 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012202
Victor Stinnercc7af722013-04-09 22:39:24 +020012203 if (PyUnicode_IS_ASCII(self)) {
12204 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12205
12206 i = 0;
12207 if (striptype != RIGHTSTRIP) {
12208 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012209 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012210 if (!_Py_ascii_whitespace[ch])
12211 break;
12212 i++;
12213 }
12214 }
12215
12216 j = len;
12217 if (striptype != LEFTSTRIP) {
12218 j--;
12219 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012220 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012221 if (!_Py_ascii_whitespace[ch])
12222 break;
12223 j--;
12224 }
12225 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012226 }
12227 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012228 else {
12229 int kind = PyUnicode_KIND(self);
12230 void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012231
Victor Stinnercc7af722013-04-09 22:39:24 +020012232 i = 0;
12233 if (striptype != RIGHTSTRIP) {
12234 while (i < len) {
12235 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12236 if (!Py_UNICODE_ISSPACE(ch))
12237 break;
12238 i++;
12239 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012240 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012241
12242 j = len;
12243 if (striptype != LEFTSTRIP) {
12244 j--;
12245 while (j >= i) {
12246 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12247 if (!Py_UNICODE_ISSPACE(ch))
12248 break;
12249 j--;
12250 }
12251 j++;
12252 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012253 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012254
Victor Stinner7931d9a2011-11-04 00:22:48 +010012255 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012256}
12257
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012258
12259static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012260do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012261{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012262 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012263
Serhiy Storchakac6792272013-10-19 21:03:34 +030012264 if (!PyArg_ParseTuple(args, stripformat[striptype], &sep))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012265 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012266
Benjamin Peterson14339b62009-01-31 16:36:08 +000012267 if (sep != NULL && sep != Py_None) {
12268 if (PyUnicode_Check(sep))
12269 return _PyUnicode_XStrip(self, striptype, sep);
12270 else {
12271 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012272 "%s arg must be None or str",
12273 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012274 return NULL;
12275 }
12276 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012277
Benjamin Peterson14339b62009-01-31 16:36:08 +000012278 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012279}
12280
12281
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012282PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012283 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012284\n\
12285Return a copy of the string S with leading and trailing\n\
12286whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012287If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012288
12289static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012290unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012291{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012292 if (PyTuple_GET_SIZE(args) == 0)
12293 return do_strip(self, BOTHSTRIP); /* Common case */
12294 else
12295 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012296}
12297
12298
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012299PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012300 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012301\n\
12302Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012303If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012304
12305static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012306unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012307{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012308 if (PyTuple_GET_SIZE(args) == 0)
12309 return do_strip(self, LEFTSTRIP); /* Common case */
12310 else
12311 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012312}
12313
12314
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012315PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012316 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012317\n\
12318Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012319If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012320
12321static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012322unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012323{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012324 if (PyTuple_GET_SIZE(args) == 0)
12325 return do_strip(self, RIGHTSTRIP); /* Common case */
12326 else
12327 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012328}
12329
12330
Guido van Rossumd57fd912000-03-10 22:53:23 +000012331static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012332unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012333{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012334 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012335 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012336
Serhiy Storchaka05997252013-01-26 12:14:02 +020012337 if (len < 1)
12338 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012339
Victor Stinnerc4b49542011-12-11 22:44:26 +010012340 /* no repeat, return original string */
12341 if (len == 1)
12342 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012343
Benjamin Petersonbac79492012-01-14 13:34:47 -050012344 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012345 return NULL;
12346
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012347 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012348 PyErr_SetString(PyExc_OverflowError,
12349 "repeated string is too long");
12350 return NULL;
12351 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012352 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012353
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012354 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012355 if (!u)
12356 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012357 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012358
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012359 if (PyUnicode_GET_LENGTH(str) == 1) {
12360 const int kind = PyUnicode_KIND(str);
12361 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012362 if (kind == PyUnicode_1BYTE_KIND) {
12363 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012364 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012365 }
12366 else if (kind == PyUnicode_2BYTE_KIND) {
12367 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012368 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012369 ucs2[n] = fill_char;
12370 } else {
12371 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12372 assert(kind == PyUnicode_4BYTE_KIND);
12373 for (n = 0; n < len; ++n)
12374 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012375 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012376 }
12377 else {
12378 /* number of characters copied this far */
12379 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012380 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012381 char *to = (char *) PyUnicode_DATA(u);
Christian Heimesf051e432016-09-13 20:22:02 +020012382 memcpy(to, PyUnicode_DATA(str),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012383 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012384 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012385 n = (done <= nchars-done) ? done : nchars-done;
Christian Heimesf051e432016-09-13 20:22:02 +020012386 memcpy(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012387 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012388 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012389 }
12390
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012391 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012392 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012393}
12394
Alexander Belopolsky40018472011-02-26 01:02:56 +000012395PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012396PyUnicode_Replace(PyObject *str,
12397 PyObject *substr,
12398 PyObject *replstr,
Alexander Belopolsky40018472011-02-26 01:02:56 +000012399 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012400{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012401 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12402 ensure_unicode(replstr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012403 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012404 return replace(str, substr, replstr, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012405}
12406
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012407PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000012408 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012409\n\
12410Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000012411old replaced by new. If the optional argument count is\n\
12412given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012413
12414static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012415unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012416{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012417 PyObject *str1;
12418 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012419 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012420
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012421 if (!PyArg_ParseTuple(args, "UU|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012422 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060012423 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012424 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012425 return replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012426}
12427
Alexander Belopolsky40018472011-02-26 01:02:56 +000012428static PyObject *
12429unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012430{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012431 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012432 Py_ssize_t isize;
12433 Py_ssize_t osize, squote, dquote, i, o;
12434 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012435 int ikind, okind, unchanged;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012436 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012437
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012438 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012439 return NULL;
12440
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012441 isize = PyUnicode_GET_LENGTH(unicode);
12442 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012443
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012444 /* Compute length of output, quote characters, and
12445 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012446 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012447 max = 127;
12448 squote = dquote = 0;
12449 ikind = PyUnicode_KIND(unicode);
12450 for (i = 0; i < isize; i++) {
12451 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040012452 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012453 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040012454 case '\'': squote++; break;
12455 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012456 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040012457 incr = 2;
12458 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012459 default:
12460 /* Fast-path ASCII */
12461 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012462 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012463 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012464 ;
12465 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012466 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012467 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012468 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012469 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012470 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012471 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040012472 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012473 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040012474 if (osize > PY_SSIZE_T_MAX - incr) {
12475 PyErr_SetString(PyExc_OverflowError,
12476 "string is too long to generate repr");
12477 return NULL;
12478 }
12479 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012480 }
12481
12482 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012483 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012484 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012485 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012486 if (dquote)
12487 /* Both squote and dquote present. Use squote,
12488 and escape them */
12489 osize += squote;
12490 else
12491 quote = '"';
12492 }
Victor Stinner55c08782013-04-14 18:45:39 +020012493 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012494
12495 repr = PyUnicode_New(osize, max);
12496 if (repr == NULL)
12497 return NULL;
12498 okind = PyUnicode_KIND(repr);
12499 odata = PyUnicode_DATA(repr);
12500
12501 PyUnicode_WRITE(okind, odata, 0, quote);
12502 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012503 if (unchanged) {
12504 _PyUnicode_FastCopyCharacters(repr, 1,
12505 unicode, 0,
12506 isize);
12507 }
12508 else {
12509 for (i = 0, o = 1; i < isize; i++) {
12510 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012511
Victor Stinner55c08782013-04-14 18:45:39 +020012512 /* Escape quotes and backslashes */
12513 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012514 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012515 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012516 continue;
12517 }
12518
12519 /* Map special whitespace to '\t', \n', '\r' */
12520 if (ch == '\t') {
12521 PyUnicode_WRITE(okind, odata, o++, '\\');
12522 PyUnicode_WRITE(okind, odata, o++, 't');
12523 }
12524 else if (ch == '\n') {
12525 PyUnicode_WRITE(okind, odata, o++, '\\');
12526 PyUnicode_WRITE(okind, odata, o++, 'n');
12527 }
12528 else if (ch == '\r') {
12529 PyUnicode_WRITE(okind, odata, o++, '\\');
12530 PyUnicode_WRITE(okind, odata, o++, 'r');
12531 }
12532
12533 /* Map non-printable US ASCII to '\xhh' */
12534 else if (ch < ' ' || ch == 0x7F) {
12535 PyUnicode_WRITE(okind, odata, o++, '\\');
12536 PyUnicode_WRITE(okind, odata, o++, 'x');
12537 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12538 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12539 }
12540
12541 /* Copy ASCII characters as-is */
12542 else if (ch < 0x7F) {
12543 PyUnicode_WRITE(okind, odata, o++, ch);
12544 }
12545
12546 /* Non-ASCII characters */
12547 else {
12548 /* Map Unicode whitespace and control characters
12549 (categories Z* and C* except ASCII space)
12550 */
12551 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12552 PyUnicode_WRITE(okind, odata, o++, '\\');
12553 /* Map 8-bit characters to '\xhh' */
12554 if (ch <= 0xff) {
12555 PyUnicode_WRITE(okind, odata, o++, 'x');
12556 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12557 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12558 }
12559 /* Map 16-bit characters to '\uxxxx' */
12560 else if (ch <= 0xffff) {
12561 PyUnicode_WRITE(okind, odata, o++, 'u');
12562 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12563 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12564 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12565 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12566 }
12567 /* Map 21-bit characters to '\U00xxxxxx' */
12568 else {
12569 PyUnicode_WRITE(okind, odata, o++, 'U');
12570 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12571 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12572 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12573 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12574 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12575 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12576 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12577 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12578 }
12579 }
12580 /* Copy characters as-is */
12581 else {
12582 PyUnicode_WRITE(okind, odata, o++, ch);
12583 }
Georg Brandl559e5d72008-06-11 18:37:52 +000012584 }
12585 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012586 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012587 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012588 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012589 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012590}
12591
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012592PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012593 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012594\n\
12595Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012596such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012597arguments start and end are interpreted as in slice notation.\n\
12598\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012599Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012600
12601static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012602unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012603{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012604 /* initialize variables to prevent gcc warning */
12605 PyObject *substring = NULL;
12606 Py_ssize_t start = 0;
12607 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012608 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012609
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012610 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012611 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012612
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012613 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012614 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012615
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012616 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012617
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012618 if (result == -2)
12619 return NULL;
12620
Christian Heimes217cfd12007-12-02 14:31:20 +000012621 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012622}
12623
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012624PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012625 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012626\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012627Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012628
12629static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012630unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012631{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012632 /* initialize variables to prevent gcc warning */
12633 PyObject *substring = NULL;
12634 Py_ssize_t start = 0;
12635 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012636 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012637
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012638 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012639 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012640
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012641 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012642 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012643
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012644 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012645
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012646 if (result == -2)
12647 return NULL;
12648
Guido van Rossumd57fd912000-03-10 22:53:23 +000012649 if (result < 0) {
12650 PyErr_SetString(PyExc_ValueError, "substring not found");
12651 return NULL;
12652 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012653
Christian Heimes217cfd12007-12-02 14:31:20 +000012654 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012655}
12656
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012657PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012658 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012659\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012660Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012661done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012662
12663static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012664unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012665{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012666 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012667 Py_UCS4 fillchar = ' ';
12668
Victor Stinnere9a29352011-10-01 02:14:59 +020012669 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012670 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012671
Benjamin Petersonbac79492012-01-14 13:34:47 -050012672 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012673 return NULL;
12674
Victor Stinnerc4b49542011-12-11 22:44:26 +010012675 if (PyUnicode_GET_LENGTH(self) >= width)
12676 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012677
Victor Stinnerc4b49542011-12-11 22:44:26 +010012678 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012679}
12680
Alexander Belopolsky40018472011-02-26 01:02:56 +000012681PyObject *
12682PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012683{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012684 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012685 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012686
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012687 return split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012688}
12689
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012690PyDoc_STRVAR(split__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012691 "S.split(sep=None, maxsplit=-1) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012692\n\
12693Return a list of the words in S, using sep as the\n\
12694delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012695splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012696whitespace string is a separator and empty strings are\n\
12697removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012698
12699static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012700unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012701{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012702 static char *kwlist[] = {"sep", "maxsplit", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000012703 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012704 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012705
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012706 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12707 kwlist, &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012708 return NULL;
12709
12710 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012711 return split(self, NULL, maxcount);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012712
12713 if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012714 return split(self, substring, maxcount);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012715
12716 PyErr_Format(PyExc_TypeError,
12717 "must be str or None, not %.100s",
12718 Py_TYPE(substring)->tp_name);
12719 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012720}
12721
Thomas Wouters477c8d52006-05-27 19:21:47 +000012722PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012723PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012724{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012725 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012726 int kind1, kind2;
12727 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012728 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012729
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012730 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012731 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012732
Victor Stinner14f8f022011-10-05 20:58:25 +020012733 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012734 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012735 len1 = PyUnicode_GET_LENGTH(str_obj);
12736 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012737 if (kind1 < kind2 || len1 < len2) {
12738 _Py_INCREF_UNICODE_EMPTY();
12739 if (!unicode_empty)
12740 out = NULL;
12741 else {
12742 out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
12743 Py_DECREF(unicode_empty);
12744 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012745 return out;
12746 }
12747 buf1 = PyUnicode_DATA(str_obj);
12748 buf2 = PyUnicode_DATA(sep_obj);
12749 if (kind2 != kind1) {
12750 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12751 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012752 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012753 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012754
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012755 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012756 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012757 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12758 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12759 else
12760 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012761 break;
12762 case PyUnicode_2BYTE_KIND:
12763 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12764 break;
12765 case PyUnicode_4BYTE_KIND:
12766 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12767 break;
12768 default:
12769 assert(0);
12770 out = 0;
12771 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012772
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012773 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012774 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012775
12776 return out;
12777}
12778
12779
12780PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012781PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012782{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012783 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012784 int kind1, kind2;
12785 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012786 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012787
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012788 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012789 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012790
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012791 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012792 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012793 len1 = PyUnicode_GET_LENGTH(str_obj);
12794 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012795 if (kind1 < kind2 || len1 < len2) {
12796 _Py_INCREF_UNICODE_EMPTY();
12797 if (!unicode_empty)
12798 out = NULL;
12799 else {
12800 out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
12801 Py_DECREF(unicode_empty);
12802 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012803 return out;
12804 }
12805 buf1 = PyUnicode_DATA(str_obj);
12806 buf2 = PyUnicode_DATA(sep_obj);
12807 if (kind2 != kind1) {
12808 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12809 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012810 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012811 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012812
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012813 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012814 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012815 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12816 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12817 else
12818 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012819 break;
12820 case PyUnicode_2BYTE_KIND:
12821 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12822 break;
12823 case PyUnicode_4BYTE_KIND:
12824 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12825 break;
12826 default:
12827 assert(0);
12828 out = 0;
12829 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012830
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012831 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012832 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012833
12834 return out;
12835}
12836
12837PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012838 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012839\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012840Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012841the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012842found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012843
12844static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012845unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012846{
Victor Stinner9310abb2011-10-05 00:59:23 +020012847 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012848}
12849
12850PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012851 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012852\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012853Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012854the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012855separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012856
12857static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012858unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012859{
Victor Stinner9310abb2011-10-05 00:59:23 +020012860 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012861}
12862
Alexander Belopolsky40018472011-02-26 01:02:56 +000012863PyObject *
12864PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012865{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012866 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012867 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012868
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012869 return rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012870}
12871
12872PyDoc_STRVAR(rsplit__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012873 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012874\n\
12875Return a list of the words in S, using sep as the\n\
12876delimiter string, starting at the end of the string and\n\
12877working to the front. If maxsplit is given, at most maxsplit\n\
12878splits are done. If sep is not specified, any whitespace string\n\
12879is a separator.");
12880
12881static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012882unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012883{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012884 static char *kwlist[] = {"sep", "maxsplit", 0};
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012885 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012886 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012887
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012888 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
12889 kwlist, &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012890 return NULL;
12891
12892 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012893 return rsplit(self, NULL, maxcount);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012894
12895 if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012896 return rsplit(self, substring, maxcount);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012897
12898 PyErr_Format(PyExc_TypeError,
12899 "must be str or None, not %.100s",
12900 Py_TYPE(substring)->tp_name);
12901 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012902}
12903
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012904PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012905 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012906\n\
12907Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012908Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012909is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012910
12911static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012912unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012913{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012914 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012915 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012916
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012917 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12918 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012919 return NULL;
12920
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012921 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012922}
12923
12924static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012925PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012926{
Victor Stinnerc4b49542011-12-11 22:44:26 +010012927 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012928}
12929
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012930PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012931 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012932\n\
12933Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012934and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012935
12936static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012937unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012938{
Benjamin Petersoneea48462012-01-16 14:28:50 -050012939 if (PyUnicode_READY(self) == -1)
12940 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012941 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012942}
12943
Larry Hastings61272b72014-01-07 12:41:53 -080012944/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000012945
Larry Hastings31826802013-10-19 00:09:25 -070012946@staticmethod
12947str.maketrans as unicode_maketrans
12948
12949 x: object
12950
12951 y: unicode=NULL
12952
12953 z: unicode=NULL
12954
12955 /
12956
12957Return a translation table usable for str.translate().
12958
12959If there is only one argument, it must be a dictionary mapping Unicode
12960ordinals (integers) or characters to Unicode ordinals, strings or None.
12961Character keys will be then converted to ordinals.
12962If there are two arguments, they must be strings of equal length, and
12963in the resulting dictionary, each character in x will be mapped to the
12964character at the same position in y. If there is a third argument, it
12965must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080012966[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070012967
Larry Hastings31826802013-10-19 00:09:25 -070012968static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080012969unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030012970/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070012971{
Georg Brandlceee0772007-11-27 23:48:05 +000012972 PyObject *new = NULL, *key, *value;
12973 Py_ssize_t i = 0;
12974 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012975
Georg Brandlceee0772007-11-27 23:48:05 +000012976 new = PyDict_New();
12977 if (!new)
12978 return NULL;
12979 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012980 int x_kind, y_kind, z_kind;
12981 void *x_data, *y_data, *z_data;
12982
Georg Brandlceee0772007-11-27 23:48:05 +000012983 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012984 if (!PyUnicode_Check(x)) {
12985 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12986 "be a string if there is a second argument");
12987 goto err;
12988 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012989 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012990 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12991 "arguments must have equal length");
12992 goto err;
12993 }
12994 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012995 x_kind = PyUnicode_KIND(x);
12996 y_kind = PyUnicode_KIND(y);
12997 x_data = PyUnicode_DATA(x);
12998 y_data = PyUnicode_DATA(y);
12999 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13000 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013001 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000013002 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060013003 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013004 if (!value) {
13005 Py_DECREF(key);
13006 goto err;
13007 }
Georg Brandlceee0772007-11-27 23:48:05 +000013008 res = PyDict_SetItem(new, key, value);
13009 Py_DECREF(key);
13010 Py_DECREF(value);
13011 if (res < 0)
13012 goto err;
13013 }
13014 /* create entries for deleting chars in z */
13015 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013016 z_kind = PyUnicode_KIND(z);
13017 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013018 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013019 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013020 if (!key)
13021 goto err;
13022 res = PyDict_SetItem(new, key, Py_None);
13023 Py_DECREF(key);
13024 if (res < 0)
13025 goto err;
13026 }
13027 }
13028 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013029 int kind;
13030 void *data;
13031
Georg Brandlceee0772007-11-27 23:48:05 +000013032 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013033 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013034 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13035 "to maketrans it must be a dict");
13036 goto err;
13037 }
13038 /* copy entries into the new dict, converting string keys to int keys */
13039 while (PyDict_Next(x, &i, &key, &value)) {
13040 if (PyUnicode_Check(key)) {
13041 /* convert string keys to integer keys */
13042 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013043 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013044 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13045 "table must be of length 1");
13046 goto err;
13047 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013048 kind = PyUnicode_KIND(key);
13049 data = PyUnicode_DATA(key);
13050 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013051 if (!newkey)
13052 goto err;
13053 res = PyDict_SetItem(new, newkey, value);
13054 Py_DECREF(newkey);
13055 if (res < 0)
13056 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013057 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013058 /* just keep integer keys */
13059 if (PyDict_SetItem(new, key, value) < 0)
13060 goto err;
13061 } else {
13062 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13063 "be strings or integers");
13064 goto err;
13065 }
13066 }
13067 }
13068 return new;
13069 err:
13070 Py_DECREF(new);
13071 return NULL;
13072}
13073
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013074PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013075 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013076\n\
Zachary Ware79b98df2015-08-05 23:54:15 -050013077Return a copy of the string S in which each character has been mapped\n\
13078through the given translation table. The table must implement\n\
13079lookup/indexing via __getitem__, for instance a dictionary or list,\n\
13080mapping Unicode ordinals to Unicode ordinals, strings, or None. If\n\
13081this operation raises LookupError, the character is left untouched.\n\
13082Characters mapped to None are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013083
13084static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013085unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013086{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013087 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013088}
13089
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013090PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013091 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013092\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013093Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013094
13095static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020013096unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013097{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013098 if (PyUnicode_READY(self) == -1)
13099 return NULL;
13100 if (PyUnicode_IS_ASCII(self))
13101 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013102 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013103}
13104
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013105PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013106 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013107\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000013108Pad a numeric string S with zeros on the left, to fill a field\n\
13109of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013110
13111static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020013112unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013113{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013114 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013115 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013116 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013117 int kind;
13118 void *data;
13119 Py_UCS4 chr;
13120
Martin v. Löwis18e16552006-02-15 17:27:45 +000013121 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000013122 return NULL;
13123
Benjamin Petersonbac79492012-01-14 13:34:47 -050013124 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013125 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013126
Victor Stinnerc4b49542011-12-11 22:44:26 +010013127 if (PyUnicode_GET_LENGTH(self) >= width)
13128 return unicode_result_unchanged(self);
13129
13130 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013131
13132 u = pad(self, fill, 0, '0');
13133
Walter Dörwald068325e2002-04-15 13:36:47 +000013134 if (u == NULL)
13135 return NULL;
13136
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013137 kind = PyUnicode_KIND(u);
13138 data = PyUnicode_DATA(u);
13139 chr = PyUnicode_READ(kind, data, fill);
13140
13141 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013142 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013143 PyUnicode_WRITE(kind, data, 0, chr);
13144 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013145 }
13146
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013147 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013148 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013149}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013150
#if 0
/* Compiled out.  Would forward self to
   PyUnicode_TransformDecimalAndSpaceToASCII() and return its result. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *ascii = PyUnicode_TransformDecimalAndSpaceToASCII(self);
    return ascii;
}
#endif
13158
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013159PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013160 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013161\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013162Return True if S starts with the specified prefix, False otherwise.\n\
13163With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013164With optional end, stop comparing S at that position.\n\
13165prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013166
13167static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013168unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013169 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013170{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013171 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013172 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013173 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013174 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013175 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013176
Jesus Ceaac451502011-04-20 17:09:23 +020013177 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013178 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013179 if (PyTuple_Check(subobj)) {
13180 Py_ssize_t i;
13181 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013182 substring = PyTuple_GET_ITEM(subobj, i);
13183 if (!PyUnicode_Check(substring)) {
13184 PyErr_Format(PyExc_TypeError,
13185 "tuple for startswith must only contain str, "
13186 "not %.100s",
13187 Py_TYPE(substring)->tp_name);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013188 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013189 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013190 result = tailmatch(self, substring, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013191 if (result == -1)
13192 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013193 if (result) {
13194 Py_RETURN_TRUE;
13195 }
13196 }
13197 /* nothing matched */
13198 Py_RETURN_FALSE;
13199 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013200 if (!PyUnicode_Check(subobj)) {
13201 PyErr_Format(PyExc_TypeError,
13202 "startswith first arg must be str or "
13203 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013204 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013205 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013206 result = tailmatch(self, subobj, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013207 if (result == -1)
13208 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013209 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013210}
13211
13212
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013213PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013214 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013215\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013216Return True if S ends with the specified suffix, False otherwise.\n\
13217With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013218With optional end, stop comparing S at that position.\n\
13219suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013220
13221static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013222unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013223 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013224{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013225 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013226 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013227 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013228 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013229 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013230
Jesus Ceaac451502011-04-20 17:09:23 +020013231 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013232 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013233 if (PyTuple_Check(subobj)) {
13234 Py_ssize_t i;
13235 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013236 substring = PyTuple_GET_ITEM(subobj, i);
13237 if (!PyUnicode_Check(substring)) {
13238 PyErr_Format(PyExc_TypeError,
13239 "tuple for endswith must only contain str, "
13240 "not %.100s",
13241 Py_TYPE(substring)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013242 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013243 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013244 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013245 if (result == -1)
13246 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013247 if (result) {
13248 Py_RETURN_TRUE;
13249 }
13250 }
13251 Py_RETURN_FALSE;
13252 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013253 if (!PyUnicode_Check(subobj)) {
13254 PyErr_Format(PyExc_TypeError,
13255 "endswith first arg must be str or "
13256 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013257 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013258 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013259 result = tailmatch(self, subobj, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013260 if (result == -1)
13261 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013262 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013263}
13264
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013265static inline void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013266_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013267{
Victor Stinnereb36fda2015-10-03 01:55:51 +020013268 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13269 writer->data = PyUnicode_DATA(writer->buffer);
13270
13271 if (!writer->readonly) {
13272 writer->kind = PyUnicode_KIND(writer->buffer);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013273 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinnereb36fda2015-10-03 01:55:51 +020013274 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013275 else {
Victor Stinnereb36fda2015-10-03 01:55:51 +020013276 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13277 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13278 writer->kind = PyUnicode_WCHAR_KIND;
13279 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13280
Victor Stinner8f674cc2013-04-17 23:02:17 +020013281 /* Copy-on-write mode: set buffer size to 0 so
13282 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13283 * next write. */
13284 writer->size = 0;
13285 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013286}
13287
Victor Stinnerd3f08822012-05-29 12:57:52 +020013288void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013289_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013290{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013291 memset(writer, 0, sizeof(*writer));
Victor Stinnereb36fda2015-10-03 01:55:51 +020013292
13293 /* ASCII is the bare minimum */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013294 writer->min_char = 127;
Victor Stinnereb36fda2015-10-03 01:55:51 +020013295
13296 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13297 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13298 writer->kind = PyUnicode_WCHAR_KIND;
13299 assert(writer->kind <= PyUnicode_1BYTE_KIND);
Victor Stinner202fdca2012-05-07 12:47:02 +020013300}
13301
Victor Stinnerd3f08822012-05-29 12:57:52 +020013302int
13303_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13304 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013305{
13306 Py_ssize_t newlen;
13307 PyObject *newbuffer;
13308
Victor Stinner2740e462016-09-06 16:58:36 -070013309 assert(maxchar <= MAX_UNICODE);
13310
Victor Stinnerca9381e2015-09-22 00:58:32 +020013311 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
Victor Stinner61744742015-09-22 01:01:17 +020013312 assert((maxchar > writer->maxchar && length >= 0)
13313 || length > 0);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013314
Victor Stinner202fdca2012-05-07 12:47:02 +020013315 if (length > PY_SSIZE_T_MAX - writer->pos) {
13316 PyErr_NoMemory();
13317 return -1;
13318 }
13319 newlen = writer->pos + length;
13320
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013321 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013322
Victor Stinnerd3f08822012-05-29 12:57:52 +020013323 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013324 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013325 if (writer->overallocate
13326 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13327 /* overallocate to limit the number of realloc() */
13328 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013329 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013330 if (newlen < writer->min_length)
13331 newlen = writer->min_length;
13332
Victor Stinnerd3f08822012-05-29 12:57:52 +020013333 writer->buffer = PyUnicode_New(newlen, maxchar);
13334 if (writer->buffer == NULL)
13335 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013336 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013337 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013338 if (writer->overallocate
13339 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13340 /* overallocate to limit the number of realloc() */
13341 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013342 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013343 if (newlen < writer->min_length)
13344 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013345
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013346 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013347 /* resize + widen */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +030013348 maxchar = Py_MAX(maxchar, writer->maxchar);
Victor Stinner202fdca2012-05-07 12:47:02 +020013349 newbuffer = PyUnicode_New(newlen, maxchar);
13350 if (newbuffer == NULL)
13351 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013352 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13353 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013354 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013355 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013356 }
13357 else {
13358 newbuffer = resize_compact(writer->buffer, newlen);
13359 if (newbuffer == NULL)
13360 return -1;
13361 }
13362 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013363 }
13364 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013365 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013366 newbuffer = PyUnicode_New(writer->size, maxchar);
13367 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013368 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013369 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13370 writer->buffer, 0, writer->pos);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030013371 Py_SETREF(writer->buffer, newbuffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013372 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013373 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013374 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010013375
13376#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020013377}
13378
Victor Stinnerca9381e2015-09-22 00:58:32 +020013379int
13380_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13381 enum PyUnicode_Kind kind)
13382{
13383 Py_UCS4 maxchar;
13384
13385 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13386 assert(writer->kind < kind);
13387
13388 switch (kind)
13389 {
13390 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13391 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13392 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
13393 default:
13394 assert(0 && "invalid kind");
13395 return -1;
13396 }
13397
13398 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13399}
13400
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013401static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013402_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013403{
Victor Stinner2740e462016-09-06 16:58:36 -070013404 assert(ch <= MAX_UNICODE);
Victor Stinnera0dd0212013-04-11 22:09:04 +020013405 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13406 return -1;
13407 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13408 writer->pos++;
13409 return 0;
13410}
13411
13412int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013413_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13414{
13415 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13416}
13417
13418int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013419_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13420{
13421 Py_UCS4 maxchar;
13422 Py_ssize_t len;
13423
13424 if (PyUnicode_READY(str) == -1)
13425 return -1;
13426 len = PyUnicode_GET_LENGTH(str);
13427 if (len == 0)
13428 return 0;
13429 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13430 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013431 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010013432 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020013433 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013434 Py_INCREF(str);
13435 writer->buffer = str;
13436 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013437 writer->pos += len;
13438 return 0;
13439 }
13440 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13441 return -1;
13442 }
13443 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13444 str, 0, len);
13445 writer->pos += len;
13446 return 0;
13447}
13448
Victor Stinnere215d962012-10-06 23:03:36 +020013449int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013450_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13451 Py_ssize_t start, Py_ssize_t end)
13452{
13453 Py_UCS4 maxchar;
13454 Py_ssize_t len;
13455
13456 if (PyUnicode_READY(str) == -1)
13457 return -1;
13458
13459 assert(0 <= start);
13460 assert(end <= PyUnicode_GET_LENGTH(str));
13461 assert(start <= end);
13462
13463 if (end == 0)
13464 return 0;
13465
13466 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13467 return _PyUnicodeWriter_WriteStr(writer, str);
13468
13469 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13470 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13471 else
13472 maxchar = writer->maxchar;
13473 len = end - start;
13474
13475 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13476 return -1;
13477
13478 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13479 str, start, len);
13480 writer->pos += len;
13481 return 0;
13482}
13483
13484int
Victor Stinner4a587072013-11-19 12:54:53 +010013485_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13486 const char *ascii, Py_ssize_t len)
13487{
13488 if (len == -1)
13489 len = strlen(ascii);
13490
13491 assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13492
13493 if (writer->buffer == NULL && !writer->overallocate) {
13494 PyObject *str;
13495
13496 str = _PyUnicode_FromASCII(ascii, len);
13497 if (str == NULL)
13498 return -1;
13499
13500 writer->readonly = 1;
13501 writer->buffer = str;
13502 _PyUnicodeWriter_Update(writer);
13503 writer->pos += len;
13504 return 0;
13505 }
13506
13507 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13508 return -1;
13509
13510 switch (writer->kind)
13511 {
13512 case PyUnicode_1BYTE_KIND:
13513 {
13514 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13515 Py_UCS1 *data = writer->data;
13516
Christian Heimesf051e432016-09-13 20:22:02 +020013517 memcpy(data + writer->pos, str, len);
Victor Stinner4a587072013-11-19 12:54:53 +010013518 break;
13519 }
13520 case PyUnicode_2BYTE_KIND:
13521 {
13522 _PyUnicode_CONVERT_BYTES(
13523 Py_UCS1, Py_UCS2,
13524 ascii, ascii + len,
13525 (Py_UCS2 *)writer->data + writer->pos);
13526 break;
13527 }
13528 case PyUnicode_4BYTE_KIND:
13529 {
13530 _PyUnicode_CONVERT_BYTES(
13531 Py_UCS1, Py_UCS4,
13532 ascii, ascii + len,
13533 (Py_UCS4 *)writer->data + writer->pos);
13534 break;
13535 }
13536 default:
13537 assert(0);
13538 }
13539
13540 writer->pos += len;
13541 return 0;
13542}
13543
13544int
13545_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13546 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020013547{
13548 Py_UCS4 maxchar;
13549
13550 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13551 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13552 return -1;
13553 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13554 writer->pos += len;
13555 return 0;
13556}
13557
Victor Stinnerd3f08822012-05-29 12:57:52 +020013558PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013559_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013560{
Victor Stinner15a0bd32013-07-08 22:29:55 +020013561 PyObject *str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013562 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013563 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013564 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013565 }
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013566 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013567 str = writer->buffer;
13568 writer->buffer = NULL;
13569 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13570 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013571 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013572 if (writer->pos == 0) {
13573 Py_CLEAR(writer->buffer);
13574
13575 /* Get the empty Unicode string singleton ('') */
13576 _Py_INCREF_UNICODE_EMPTY();
13577 str = unicode_empty;
Victor Stinner202fdca2012-05-07 12:47:02 +020013578 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013579 else {
13580 str = writer->buffer;
13581 writer->buffer = NULL;
13582
13583 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
13584 PyObject *str2;
13585 str2 = resize_compact(str, writer->pos);
13586 if (str2 == NULL)
13587 return NULL;
13588 str = str2;
13589 }
13590 }
13591
Victor Stinner15a0bd32013-07-08 22:29:55 +020013592 assert(_PyUnicode_CheckConsistency(str, 1));
13593 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020013594}
13595
Victor Stinnerd3f08822012-05-29 12:57:52 +020013596void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013597_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013598{
13599 Py_CLEAR(writer->buffer);
13600}
13601
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013602#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013603
13604PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013605 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013606\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013607Return a formatted version of S, using substitutions from args and kwargs.\n\
13608The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013609
Eric Smith27bbca62010-11-04 17:06:58 +000013610PyDoc_STRVAR(format_map__doc__,
13611 "S.format_map(mapping) -> str\n\
13612\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013613Return a formatted version of S, using substitutions from mapping.\n\
13614The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013615
Eric Smith4a7d76d2008-05-30 18:10:19 +000013616static PyObject *
13617unicode__format__(PyObject* self, PyObject* args)
13618{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013619 PyObject *format_spec;
13620 _PyUnicodeWriter writer;
13621 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013622
13623 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
13624 return NULL;
13625
Victor Stinnerd3f08822012-05-29 12:57:52 +020013626 if (PyUnicode_READY(self) == -1)
13627 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020013628 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013629 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13630 self, format_spec, 0,
13631 PyUnicode_GET_LENGTH(format_spec));
13632 if (ret == -1) {
13633 _PyUnicodeWriter_Dealloc(&writer);
13634 return NULL;
13635 }
13636 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013637}
13638
Eric Smith8c663262007-08-25 02:26:07 +000013639PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013640 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013641\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013642Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000013643
13644static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013645unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013646{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013647 Py_ssize_t size;
13648
13649 /* If it's a compact object, account for base structure +
13650 character data. */
13651 if (PyUnicode_IS_COMPACT_ASCII(v))
13652 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
13653 else if (PyUnicode_IS_COMPACT(v))
13654 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013655 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013656 else {
13657 /* If it is a two-block object, account for base object, and
13658 for character block if present. */
13659 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020013660 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013661 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013662 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013663 }
13664 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013665 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020013666 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013667 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020013668 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020013669 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013670
13671 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013672}
13673
13674PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013675 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013676
13677static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013678unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013679{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013680 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013681 if (!copy)
13682 return NULL;
13683 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013684}
13685
Guido van Rossumd57fd912000-03-10 22:53:23 +000013686static PyMethodDef unicode_methods[] = {
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000013687 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013688 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
Ezio Melotticda6b6d2012-02-26 09:39:55 +020013689 {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
13690 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013691 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
13692 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
Benjamin Petersond5890c82012-01-14 13:23:30 -050013693 {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013694 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
13695 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
13696 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
Ezio Melotti745d54d2013-11-16 19:10:57 +020013697 {"expandtabs", (PyCFunction) unicode_expandtabs,
13698 METH_VARARGS | METH_KEYWORDS, expandtabs__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013699 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013700 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013701 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
13702 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
13703 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013704 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013705 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13706 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
13707 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013708 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013709 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Ezio Melotti745d54d2013-11-16 19:10:57 +020013710 {"splitlines", (PyCFunction) unicode_splitlines,
13711 METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013712 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013713 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
13714 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
13715 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
13716 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13717 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
13718 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
13719 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
13720 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
13721 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
13722 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
13723 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
13724 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
13725 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
13726 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000013727 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000013728 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013729 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000013730 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013731 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000013732 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Larry Hastings31826802013-10-19 00:09:25 -070013733 UNICODE_MAKETRANS_METHODDEF
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013734 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000013735#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013736 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013737 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013738#endif
13739
Benjamin Peterson14339b62009-01-31 16:36:08 +000013740 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013741 {NULL, NULL}
13742};
13743
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013744static PyObject *
13745unicode_mod(PyObject *v, PyObject *w)
13746{
Brian Curtindfc80e32011-08-10 20:28:54 -050013747 if (!PyUnicode_Check(v))
13748 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013749 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013750}
13751
13752static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013753 0, /*nb_add*/
13754 0, /*nb_subtract*/
13755 0, /*nb_multiply*/
13756 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013757};
13758
Guido van Rossumd57fd912000-03-10 22:53:23 +000013759static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013760 (lenfunc) unicode_length, /* sq_length */
13761 PyUnicode_Concat, /* sq_concat */
13762 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13763 (ssizeargfunc) unicode_getitem, /* sq_item */
13764 0, /* sq_slice */
13765 0, /* sq_ass_item */
13766 0, /* sq_ass_slice */
13767 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013768};
13769
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013770static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013771unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013772{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013773 if (PyUnicode_READY(self) == -1)
13774 return NULL;
13775
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013776 if (PyIndex_Check(item)) {
13777 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013778 if (i == -1 && PyErr_Occurred())
13779 return NULL;
13780 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013781 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013782 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013783 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013784 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013785 PyObject *result;
13786 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013787 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013788 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013789
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013790 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013791 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013792 return NULL;
13793 }
13794
13795 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020013796 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013797 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013798 slicelength == PyUnicode_GET_LENGTH(self)) {
13799 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013800 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013801 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013802 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013803 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013804 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013805 src_kind = PyUnicode_KIND(self);
13806 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013807 if (!PyUnicode_IS_ASCII(self)) {
13808 kind_limit = kind_maxchar_limit(src_kind);
13809 max_char = 0;
13810 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13811 ch = PyUnicode_READ(src_kind, src_data, cur);
13812 if (ch > max_char) {
13813 max_char = ch;
13814 if (max_char >= kind_limit)
13815 break;
13816 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013817 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013818 }
Victor Stinner55c99112011-10-13 01:17:06 +020013819 else
13820 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013821 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013822 if (result == NULL)
13823 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013824 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013825 dest_data = PyUnicode_DATA(result);
13826
13827 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013828 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13829 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013830 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013831 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013832 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013833 } else {
13834 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13835 return NULL;
13836 }
13837}
13838
13839static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013840 (lenfunc)unicode_length, /* mp_length */
13841 (binaryfunc)unicode_subscript, /* mp_subscript */
13842 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013843};
13844
Guido van Rossumd57fd912000-03-10 22:53:23 +000013845
Guido van Rossumd57fd912000-03-10 22:53:23 +000013846/* Helpers for PyUnicode_Format() */
13847
Victor Stinnera47082312012-10-04 02:19:54 +020013848struct unicode_formatter_t {
13849 PyObject *args;
13850 int args_owned;
13851 Py_ssize_t arglen, argidx;
13852 PyObject *dict;
13853
13854 enum PyUnicode_Kind fmtkind;
13855 Py_ssize_t fmtcnt, fmtpos;
13856 void *fmtdata;
13857 PyObject *fmtstr;
13858
13859 _PyUnicodeWriter writer;
13860};
13861
13862struct unicode_format_arg_t {
13863 Py_UCS4 ch;
13864 int flags;
13865 Py_ssize_t width;
13866 int prec;
13867 int sign;
13868};
13869
Guido van Rossumd57fd912000-03-10 22:53:23 +000013870static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020013871unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013872{
Victor Stinnera47082312012-10-04 02:19:54 +020013873 Py_ssize_t argidx = ctx->argidx;
13874
13875 if (argidx < ctx->arglen) {
13876 ctx->argidx++;
13877 if (ctx->arglen < 0)
13878 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000013879 else
Victor Stinnera47082312012-10-04 02:19:54 +020013880 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013881 }
13882 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013883 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013884 return NULL;
13885}
13886
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013887/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013888
Victor Stinnera47082312012-10-04 02:19:54 +020013889/* Format a float into the writer if the writer is not NULL, or into *p_output
13890 otherwise.
13891
13892 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020013893static int
Victor Stinnera47082312012-10-04 02:19:54 +020013894formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
13895 PyObject **p_output,
13896 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013897{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013898 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013899 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013900 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020013901 int prec;
13902 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000013903
Guido van Rossumd57fd912000-03-10 22:53:23 +000013904 x = PyFloat_AsDouble(v);
13905 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020013906 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013907
Victor Stinnera47082312012-10-04 02:19:54 +020013908 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013909 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013910 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013911
Victor Stinnera47082312012-10-04 02:19:54 +020013912 if (arg->flags & F_ALT)
13913 dtoa_flags = Py_DTSF_ALT;
13914 else
13915 dtoa_flags = 0;
13916 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013917 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020013918 return -1;
13919 len = strlen(p);
13920 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010013921 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020013922 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013923 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020013924 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020013925 }
13926 else
13927 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000013928 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013929 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013930}
13931
Victor Stinnerd0880d52012-04-27 23:40:13 +020013932/* formatlong() emulates the format codes d, u, o, x and X, and
13933 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
13934 * Python's regular ints.
13935 * Return value: a new PyUnicodeObject*, or NULL if error.
13936 * The output string is of the form
13937 * "-"? ("0x" | "0X")? digit+
13938 * "0x"/"0X" are present only for x and X conversions, with F_ALT
13939 * set in flags. The case of hex digits will be correct,
13940 * There will be at least prec digits, zero-filled on the left if
13941 * necessary to get that many.
13942 * val object to be converted
13943 * flags bitmask of format flags; only F_ALT is looked at
13944 * prec minimum number of digits; 0-fill on left if needed
13945 * type a character in [duoxX]; u acts the same as d
13946 *
13947 * CAUTION: o, x and X conversions on regular ints can never
13948 * produce a '-' sign, but can for Python's unbounded ints.
13949 */
Ethan Furmanb95b5612015-01-23 20:05:18 -080013950PyObject *
13951_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
Tim Peters38fd5b62000-09-21 05:43:11 +000013952{
Victor Stinnerd0880d52012-04-27 23:40:13 +020013953 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013954 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013955 Py_ssize_t i;
13956 int sign; /* 1 if '-', else 0 */
13957 int len; /* number of characters */
13958 Py_ssize_t llen;
13959 int numdigits; /* len == numnondigits + numdigits */
13960 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000013961
Victor Stinnerd0880d52012-04-27 23:40:13 +020013962 /* Avoid exceeding SSIZE_T_MAX */
13963 if (prec > INT_MAX-3) {
13964 PyErr_SetString(PyExc_OverflowError,
13965 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013966 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013967 }
13968
13969 assert(PyLong_Check(val));
13970
13971 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020013972 default:
13973 assert(!"'type' not in [diuoxX]");
Victor Stinnerd0880d52012-04-27 23:40:13 +020013974 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020013975 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020013976 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070013977 /* int and int subclasses should print numerically when a numeric */
13978 /* format code is used (see issue18780) */
13979 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013980 break;
13981 case 'o':
13982 numnondigits = 2;
13983 result = PyNumber_ToBase(val, 8);
13984 break;
13985 case 'x':
13986 case 'X':
13987 numnondigits = 2;
13988 result = PyNumber_ToBase(val, 16);
13989 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013990 }
13991 if (!result)
13992 return NULL;
13993
13994 assert(unicode_modifiable(result));
13995 assert(PyUnicode_IS_READY(result));
13996 assert(PyUnicode_IS_ASCII(result));
13997
13998 /* To modify the string in-place, there can only be one reference. */
13999 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014000 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014001 PyErr_BadInternalCall();
14002 return NULL;
14003 }
14004 buf = PyUnicode_DATA(result);
14005 llen = PyUnicode_GET_LENGTH(result);
14006 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014007 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014008 PyErr_SetString(PyExc_ValueError,
Ethan Furmanb95b5612015-01-23 20:05:18 -080014009 "string too large in _PyUnicode_FormatLong");
Victor Stinnerd0880d52012-04-27 23:40:13 +020014010 return NULL;
14011 }
14012 len = (int)llen;
14013 sign = buf[0] == '-';
14014 numnondigits += sign;
14015 numdigits = len - numnondigits;
14016 assert(numdigits > 0);
14017
14018 /* Get rid of base marker unless F_ALT */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014019 if (((alt) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020014020 (type == 'o' || type == 'x' || type == 'X'))) {
14021 assert(buf[sign] == '0');
14022 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14023 buf[sign+1] == 'o');
14024 numnondigits -= 2;
14025 buf += 2;
14026 len -= 2;
14027 if (sign)
14028 buf[0] = '-';
14029 assert(len == numnondigits + numdigits);
14030 assert(numdigits > 0);
14031 }
14032
14033 /* Fill with leading zeroes to meet minimum width. */
14034 if (prec > numdigits) {
14035 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14036 numnondigits + prec);
14037 char *b1;
14038 if (!r1) {
14039 Py_DECREF(result);
14040 return NULL;
14041 }
14042 b1 = PyBytes_AS_STRING(r1);
14043 for (i = 0; i < numnondigits; ++i)
14044 *b1++ = *buf++;
14045 for (i = 0; i < prec - numdigits; i++)
14046 *b1++ = '0';
14047 for (i = 0; i < numdigits; i++)
14048 *b1++ = *buf++;
14049 *b1 = '\0';
14050 Py_DECREF(result);
14051 result = r1;
14052 buf = PyBytes_AS_STRING(result);
14053 len = numnondigits + prec;
14054 }
14055
14056 /* Fix up case for hex conversions. */
14057 if (type == 'X') {
14058 /* Need to convert all lower case letters to upper case.
14059 and need to convert 0x to 0X (and -0x to -0X). */
14060 for (i = 0; i < len; i++)
14061 if (buf[i] >= 'a' && buf[i] <= 'x')
14062 buf[i] -= 'a'-'A';
14063 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014064 if (!PyUnicode_Check(result)
14065 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020014066 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014067 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014068 Py_DECREF(result);
14069 result = unicode;
14070 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014071 else if (len != PyUnicode_GET_LENGTH(result)) {
14072 if (PyUnicode_Resize(&result, len) < 0)
14073 Py_CLEAR(result);
14074 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014075 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000014076}
14077
Ethan Furmandf3ed242014-01-05 06:50:30 -080014078/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014079 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014080 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014081 * -1 and raise an exception on error */
14082static int
Victor Stinnera47082312012-10-04 02:19:54 +020014083mainformatlong(PyObject *v,
14084 struct unicode_format_arg_t *arg,
14085 PyObject **p_output,
14086 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014087{
14088 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014089 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014090
14091 if (!PyNumber_Check(v))
14092 goto wrongtype;
14093
Ethan Furman9ab74802014-03-21 06:38:46 -070014094 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014095 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014096 if (type == 'o' || type == 'x' || type == 'X') {
14097 iobj = PyNumber_Index(v);
14098 if (iobj == NULL) {
Ethan Furman9ab74802014-03-21 06:38:46 -070014099 if (PyErr_ExceptionMatches(PyExc_TypeError))
14100 goto wrongtype;
Ethan Furman38d872e2014-03-19 08:38:52 -070014101 return -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014102 }
14103 }
14104 else {
14105 iobj = PyNumber_Long(v);
14106 if (iobj == NULL ) {
14107 if (PyErr_ExceptionMatches(PyExc_TypeError))
14108 goto wrongtype;
14109 return -1;
14110 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014111 }
14112 assert(PyLong_Check(iobj));
14113 }
14114 else {
14115 iobj = v;
14116 Py_INCREF(iobj);
14117 }
14118
14119 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014120 && arg->width == -1 && arg->prec == -1
14121 && !(arg->flags & (F_SIGN | F_BLANK))
14122 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014123 {
14124 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014125 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014126 int base;
14127
Victor Stinnera47082312012-10-04 02:19:54 +020014128 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014129 {
14130 default:
14131 assert(0 && "'type' not in [diuoxX]");
14132 case 'd':
14133 case 'i':
14134 case 'u':
14135 base = 10;
14136 break;
14137 case 'o':
14138 base = 8;
14139 break;
14140 case 'x':
14141 case 'X':
14142 base = 16;
14143 break;
14144 }
14145
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014146 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14147 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014148 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014149 }
14150 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014151 return 1;
14152 }
14153
Ethan Furmanb95b5612015-01-23 20:05:18 -080014154 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014155 Py_DECREF(iobj);
14156 if (res == NULL)
14157 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014158 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014159 return 0;
14160
14161wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014162 switch(type)
14163 {
14164 case 'o':
14165 case 'x':
14166 case 'X':
14167 PyErr_Format(PyExc_TypeError,
14168 "%%%c format: an integer is required, "
14169 "not %.200s",
14170 type, Py_TYPE(v)->tp_name);
14171 break;
14172 default:
14173 PyErr_Format(PyExc_TypeError,
14174 "%%%c format: a number is required, "
14175 "not %.200s",
14176 type, Py_TYPE(v)->tp_name);
14177 break;
14178 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014179 return -1;
14180}
14181
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014182static Py_UCS4
14183formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014184{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014185 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014186 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014187 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014188 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014189 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014190 goto onError;
14191 }
14192 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014193 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014194 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014195 /* make sure number is a type of integer */
14196 if (!PyLong_Check(v)) {
14197 iobj = PyNumber_Index(v);
14198 if (iobj == NULL) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014199 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014200 }
14201 v = iobj;
14202 Py_DECREF(iobj);
14203 }
14204 /* Integer input truncated to a character */
Benjamin Peterson29060642009-01-31 22:14:21 +000014205 x = PyLong_AsLong(v);
14206 if (x == -1 && PyErr_Occurred())
14207 goto onError;
14208
Victor Stinner8faf8212011-12-08 22:14:11 +010014209 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014210 PyErr_SetString(PyExc_OverflowError,
14211 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014212 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014213 }
14214
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014215 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014216 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014217
Benjamin Peterson29060642009-01-31 22:14:21 +000014218 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014219 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014220 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014221 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014222}
14223
Victor Stinnera47082312012-10-04 02:19:54 +020014224/* Parse options of an argument: flags, width, precision.
14225 Handle also "%(name)" syntax.
14226
14227 Return 0 if the argument has been formatted into arg->str.
14228 Return 1 if the argument has been written into ctx->writer,
14229 Raise an exception and return -1 on error. */
14230static int
14231unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14232 struct unicode_format_arg_t *arg)
14233{
14234#define FORMAT_READ(ctx) \
14235 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14236
14237 PyObject *v;
14238
Victor Stinnera47082312012-10-04 02:19:54 +020014239 if (arg->ch == '(') {
14240 /* Get argument value from a dictionary. Example: "%(name)s". */
14241 Py_ssize_t keystart;
14242 Py_ssize_t keylen;
14243 PyObject *key;
14244 int pcount = 1;
14245
14246 if (ctx->dict == NULL) {
14247 PyErr_SetString(PyExc_TypeError,
14248 "format requires a mapping");
14249 return -1;
14250 }
14251 ++ctx->fmtpos;
14252 --ctx->fmtcnt;
14253 keystart = ctx->fmtpos;
14254 /* Skip over balanced parentheses */
14255 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14256 arg->ch = FORMAT_READ(ctx);
14257 if (arg->ch == ')')
14258 --pcount;
14259 else if (arg->ch == '(')
14260 ++pcount;
14261 ctx->fmtpos++;
14262 }
14263 keylen = ctx->fmtpos - keystart - 1;
14264 if (ctx->fmtcnt < 0 || pcount > 0) {
14265 PyErr_SetString(PyExc_ValueError,
14266 "incomplete format key");
14267 return -1;
14268 }
14269 key = PyUnicode_Substring(ctx->fmtstr,
14270 keystart, keystart + keylen);
14271 if (key == NULL)
14272 return -1;
14273 if (ctx->args_owned) {
Victor Stinnera47082312012-10-04 02:19:54 +020014274 ctx->args_owned = 0;
Serhiy Storchaka191321d2015-12-27 15:41:34 +020014275 Py_DECREF(ctx->args);
Victor Stinnera47082312012-10-04 02:19:54 +020014276 }
14277 ctx->args = PyObject_GetItem(ctx->dict, key);
14278 Py_DECREF(key);
14279 if (ctx->args == NULL)
14280 return -1;
14281 ctx->args_owned = 1;
14282 ctx->arglen = -1;
14283 ctx->argidx = -2;
14284 }
14285
14286 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014287 while (--ctx->fmtcnt >= 0) {
14288 arg->ch = FORMAT_READ(ctx);
14289 ctx->fmtpos++;
14290 switch (arg->ch) {
14291 case '-': arg->flags |= F_LJUST; continue;
14292 case '+': arg->flags |= F_SIGN; continue;
14293 case ' ': arg->flags |= F_BLANK; continue;
14294 case '#': arg->flags |= F_ALT; continue;
14295 case '0': arg->flags |= F_ZERO; continue;
14296 }
14297 break;
14298 }
14299
14300 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014301 if (arg->ch == '*') {
14302 v = unicode_format_getnextarg(ctx);
14303 if (v == NULL)
14304 return -1;
14305 if (!PyLong_Check(v)) {
14306 PyErr_SetString(PyExc_TypeError,
14307 "* wants int");
14308 return -1;
14309 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014310 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014311 if (arg->width == -1 && PyErr_Occurred())
14312 return -1;
14313 if (arg->width < 0) {
14314 arg->flags |= F_LJUST;
14315 arg->width = -arg->width;
14316 }
14317 if (--ctx->fmtcnt >= 0) {
14318 arg->ch = FORMAT_READ(ctx);
14319 ctx->fmtpos++;
14320 }
14321 }
14322 else if (arg->ch >= '0' && arg->ch <= '9') {
14323 arg->width = arg->ch - '0';
14324 while (--ctx->fmtcnt >= 0) {
14325 arg->ch = FORMAT_READ(ctx);
14326 ctx->fmtpos++;
14327 if (arg->ch < '0' || arg->ch > '9')
14328 break;
14329 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14330 mixing signed and unsigned comparison. Since arg->ch is between
14331 '0' and '9', casting to int is safe. */
14332 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14333 PyErr_SetString(PyExc_ValueError,
14334 "width too big");
14335 return -1;
14336 }
14337 arg->width = arg->width*10 + (arg->ch - '0');
14338 }
14339 }
14340
14341 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014342 if (arg->ch == '.') {
14343 arg->prec = 0;
14344 if (--ctx->fmtcnt >= 0) {
14345 arg->ch = FORMAT_READ(ctx);
14346 ctx->fmtpos++;
14347 }
14348 if (arg->ch == '*') {
14349 v = unicode_format_getnextarg(ctx);
14350 if (v == NULL)
14351 return -1;
14352 if (!PyLong_Check(v)) {
14353 PyErr_SetString(PyExc_TypeError,
14354 "* wants int");
14355 return -1;
14356 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014357 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014358 if (arg->prec == -1 && PyErr_Occurred())
14359 return -1;
14360 if (arg->prec < 0)
14361 arg->prec = 0;
14362 if (--ctx->fmtcnt >= 0) {
14363 arg->ch = FORMAT_READ(ctx);
14364 ctx->fmtpos++;
14365 }
14366 }
14367 else if (arg->ch >= '0' && arg->ch <= '9') {
14368 arg->prec = arg->ch - '0';
14369 while (--ctx->fmtcnt >= 0) {
14370 arg->ch = FORMAT_READ(ctx);
14371 ctx->fmtpos++;
14372 if (arg->ch < '0' || arg->ch > '9')
14373 break;
14374 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14375 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014376 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014377 return -1;
14378 }
14379 arg->prec = arg->prec*10 + (arg->ch - '0');
14380 }
14381 }
14382 }
14383
14384 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14385 if (ctx->fmtcnt >= 0) {
14386 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14387 if (--ctx->fmtcnt >= 0) {
14388 arg->ch = FORMAT_READ(ctx);
14389 ctx->fmtpos++;
14390 }
14391 }
14392 }
14393 if (ctx->fmtcnt < 0) {
14394 PyErr_SetString(PyExc_ValueError,
14395 "incomplete format");
14396 return -1;
14397 }
14398 return 0;
14399
14400#undef FORMAT_READ
14401}
14402
14403/* Format one argument. Supported conversion specifiers:
14404
14405 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014406 - "i", "d", "u": int or float
14407 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014408 - "e", "E", "f", "F", "g", "G": float
14409 - "c": int or str (1 character)
14410
Victor Stinner8dbd4212012-12-04 09:30:24 +010014411 When possible, the output is written directly into the Unicode writer
14412 (ctx->writer). A string is created when padding is required.
14413
Victor Stinnera47082312012-10-04 02:19:54 +020014414 Return 0 if the argument has been formatted into *p_str,
14415 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014416 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014417static int
14418unicode_format_arg_format(struct unicode_formatter_t *ctx,
14419 struct unicode_format_arg_t *arg,
14420 PyObject **p_str)
14421{
14422 PyObject *v;
14423 _PyUnicodeWriter *writer = &ctx->writer;
14424
14425 if (ctx->fmtcnt == 0)
14426 ctx->writer.overallocate = 0;
14427
14428 if (arg->ch == '%') {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014429 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014430 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014431 return 1;
14432 }
14433
14434 v = unicode_format_getnextarg(ctx);
14435 if (v == NULL)
14436 return -1;
14437
Victor Stinnera47082312012-10-04 02:19:54 +020014438
14439 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014440 case 's':
14441 case 'r':
14442 case 'a':
14443 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14444 /* Fast path */
14445 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14446 return -1;
14447 return 1;
14448 }
14449
14450 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14451 *p_str = v;
14452 Py_INCREF(*p_str);
14453 }
14454 else {
14455 if (arg->ch == 's')
14456 *p_str = PyObject_Str(v);
14457 else if (arg->ch == 'r')
14458 *p_str = PyObject_Repr(v);
14459 else
14460 *p_str = PyObject_ASCII(v);
14461 }
14462 break;
14463
14464 case 'i':
14465 case 'd':
14466 case 'u':
14467 case 'o':
14468 case 'x':
14469 case 'X':
14470 {
14471 int ret = mainformatlong(v, arg, p_str, writer);
14472 if (ret != 0)
14473 return ret;
14474 arg->sign = 1;
14475 break;
14476 }
14477
14478 case 'e':
14479 case 'E':
14480 case 'f':
14481 case 'F':
14482 case 'g':
14483 case 'G':
14484 if (arg->width == -1 && arg->prec == -1
14485 && !(arg->flags & (F_SIGN | F_BLANK)))
14486 {
14487 /* Fast path */
14488 if (formatfloat(v, arg, NULL, writer) == -1)
14489 return -1;
14490 return 1;
14491 }
14492
14493 arg->sign = 1;
14494 if (formatfloat(v, arg, p_str, NULL) == -1)
14495 return -1;
14496 break;
14497
14498 case 'c':
14499 {
14500 Py_UCS4 ch = formatchar(v);
14501 if (ch == (Py_UCS4) -1)
14502 return -1;
14503 if (arg->width == -1 && arg->prec == -1) {
14504 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014505 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014506 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014507 return 1;
14508 }
14509 *p_str = PyUnicode_FromOrdinal(ch);
14510 break;
14511 }
14512
14513 default:
14514 PyErr_Format(PyExc_ValueError,
14515 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020014516 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020014517 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14518 (int)arg->ch,
14519 ctx->fmtpos - 1);
14520 return -1;
14521 }
14522 if (*p_str == NULL)
14523 return -1;
14524 assert (PyUnicode_Check(*p_str));
14525 return 0;
14526}
14527
14528static int
14529unicode_format_arg_output(struct unicode_formatter_t *ctx,
14530 struct unicode_format_arg_t *arg,
14531 PyObject *str)
14532{
14533 Py_ssize_t len;
14534 enum PyUnicode_Kind kind;
14535 void *pbuf;
14536 Py_ssize_t pindex;
14537 Py_UCS4 signchar;
14538 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014539 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014540 Py_ssize_t sublen;
14541 _PyUnicodeWriter *writer = &ctx->writer;
14542 Py_UCS4 fill;
14543
14544 fill = ' ';
14545 if (arg->sign && arg->flags & F_ZERO)
14546 fill = '0';
14547
14548 if (PyUnicode_READY(str) == -1)
14549 return -1;
14550
14551 len = PyUnicode_GET_LENGTH(str);
14552 if ((arg->width == -1 || arg->width <= len)
14553 && (arg->prec == -1 || arg->prec >= len)
14554 && !(arg->flags & (F_SIGN | F_BLANK)))
14555 {
14556 /* Fast path */
14557 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
14558 return -1;
14559 return 0;
14560 }
14561
14562 /* Truncate the string for "s", "r" and "a" formats
14563 if the precision is set */
14564 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
14565 if (arg->prec >= 0 && len > arg->prec)
14566 len = arg->prec;
14567 }
14568
14569 /* Adjust sign and width */
14570 kind = PyUnicode_KIND(str);
14571 pbuf = PyUnicode_DATA(str);
14572 pindex = 0;
14573 signchar = '\0';
14574 if (arg->sign) {
14575 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14576 if (ch == '-' || ch == '+') {
14577 signchar = ch;
14578 len--;
14579 pindex++;
14580 }
14581 else if (arg->flags & F_SIGN)
14582 signchar = '+';
14583 else if (arg->flags & F_BLANK)
14584 signchar = ' ';
14585 else
14586 arg->sign = 0;
14587 }
14588 if (arg->width < len)
14589 arg->width = len;
14590
14591 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014592 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014593 if (!(arg->flags & F_LJUST)) {
14594 if (arg->sign) {
14595 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014596 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014597 }
14598 else {
14599 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014600 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014601 }
14602 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014603 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
14604 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014605 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014606 }
14607
Victor Stinnera47082312012-10-04 02:19:54 +020014608 buflen = arg->width;
14609 if (arg->sign && len == arg->width)
14610 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014611 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020014612 return -1;
14613
14614 /* Write the sign if needed */
14615 if (arg->sign) {
14616 if (fill != ' ') {
14617 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14618 writer->pos += 1;
14619 }
14620 if (arg->width > len)
14621 arg->width--;
14622 }
14623
14624 /* Write the numeric prefix for "x", "X" and "o" formats
14625 if the alternate form is used.
14626 For example, write "0x" for the "%#x" format. */
14627 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14628 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14629 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14630 if (fill != ' ') {
14631 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14632 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14633 writer->pos += 2;
14634 pindex += 2;
14635 }
14636 arg->width -= 2;
14637 if (arg->width < 0)
14638 arg->width = 0;
14639 len -= 2;
14640 }
14641
14642 /* Pad left with the fill character if needed */
14643 if (arg->width > len && !(arg->flags & F_LJUST)) {
14644 sublen = arg->width - len;
14645 FILL(writer->kind, writer->data, fill, writer->pos, sublen);
14646 writer->pos += sublen;
14647 arg->width = len;
14648 }
14649
14650 /* If padding with spaces: write sign if needed and/or numeric prefix if
14651 the alternate form is used */
14652 if (fill == ' ') {
14653 if (arg->sign) {
14654 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14655 writer->pos += 1;
14656 }
14657 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14658 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14659 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
14660 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14661 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14662 writer->pos += 2;
14663 pindex += 2;
14664 }
14665 }
14666
14667 /* Write characters */
14668 if (len) {
14669 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14670 str, pindex, len);
14671 writer->pos += len;
14672 }
14673
14674 /* Pad right with the fill character if needed */
14675 if (arg->width > len) {
14676 sublen = arg->width - len;
14677 FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
14678 writer->pos += sublen;
14679 }
14680 return 0;
14681}
14682
14683/* Helper of PyUnicode_Format(): format one arg.
14684 Return 0 on success, raise an exception and return -1 on error. */
14685static int
14686unicode_format_arg(struct unicode_formatter_t *ctx)
14687{
14688 struct unicode_format_arg_t arg;
14689 PyObject *str;
14690 int ret;
14691
Victor Stinner8dbd4212012-12-04 09:30:24 +010014692 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
14693 arg.flags = 0;
14694 arg.width = -1;
14695 arg.prec = -1;
14696 arg.sign = 0;
14697 str = NULL;
14698
Victor Stinnera47082312012-10-04 02:19:54 +020014699 ret = unicode_format_arg_parse(ctx, &arg);
14700 if (ret == -1)
14701 return -1;
14702
14703 ret = unicode_format_arg_format(ctx, &arg, &str);
14704 if (ret == -1)
14705 return -1;
14706
14707 if (ret != 1) {
14708 ret = unicode_format_arg_output(ctx, &arg, str);
14709 Py_DECREF(str);
14710 if (ret == -1)
14711 return -1;
14712 }
14713
14714 if (ctx->dict && (ctx->argidx < ctx->arglen) && arg.ch != '%') {
14715 PyErr_SetString(PyExc_TypeError,
14716 "not all arguments converted during string formatting");
14717 return -1;
14718 }
14719 return 0;
14720}
14721
Alexander Belopolsky40018472011-02-26 01:02:56 +000014722PyObject *
14723PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014724{
Victor Stinnera47082312012-10-04 02:19:54 +020014725 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000014726
Guido van Rossumd57fd912000-03-10 22:53:23 +000014727 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014728 PyErr_BadInternalCall();
14729 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014730 }
Victor Stinnera47082312012-10-04 02:19:54 +020014731
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014732 if (ensure_unicode(format) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014733 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014734
14735 ctx.fmtstr = format;
Victor Stinnera47082312012-10-04 02:19:54 +020014736 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14737 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14738 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14739 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014740
Victor Stinner8f674cc2013-04-17 23:02:17 +020014741 _PyUnicodeWriter_Init(&ctx.writer);
14742 ctx.writer.min_length = ctx.fmtcnt + 100;
14743 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014744
Guido van Rossumd57fd912000-03-10 22:53:23 +000014745 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014746 ctx.arglen = PyTuple_Size(args);
14747 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014748 }
14749 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014750 ctx.arglen = -1;
14751 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014752 }
Victor Stinnera47082312012-10-04 02:19:54 +020014753 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040014754 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020014755 ctx.dict = args;
14756 else
14757 ctx.dict = NULL;
14758 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014759
Victor Stinnera47082312012-10-04 02:19:54 +020014760 while (--ctx.fmtcnt >= 0) {
14761 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020014762 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020014763
14764 nonfmtpos = ctx.fmtpos++;
14765 while (ctx.fmtcnt >= 0 &&
14766 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14767 ctx.fmtpos++;
14768 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014769 }
Victor Stinnera47082312012-10-04 02:19:54 +020014770 if (ctx.fmtcnt < 0) {
14771 ctx.fmtpos--;
14772 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020014773 }
Victor Stinneree4544c2012-05-09 22:24:08 +020014774
Victor Stinnercfc4c132013-04-03 01:48:39 +020014775 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14776 nonfmtpos, ctx.fmtpos) < 0)
14777 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014778 }
14779 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014780 ctx.fmtpos++;
14781 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000014782 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020014783 }
14784 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020014785
Victor Stinnera47082312012-10-04 02:19:54 +020014786 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014787 PyErr_SetString(PyExc_TypeError,
14788 "not all arguments converted during string formatting");
14789 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014790 }
14791
Victor Stinnera47082312012-10-04 02:19:54 +020014792 if (ctx.args_owned) {
14793 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014794 }
Victor Stinnera47082312012-10-04 02:19:54 +020014795 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014796
Benjamin Peterson29060642009-01-31 22:14:21 +000014797 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020014798 _PyUnicodeWriter_Dealloc(&ctx.writer);
14799 if (ctx.args_owned) {
14800 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014801 }
14802 return NULL;
14803}
14804
Jeremy Hylton938ace62002-07-17 16:30:39 +000014805static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000014806unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14807
Tim Peters6d6c1a32001-08-02 04:15:00 +000014808static PyObject *
14809unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14810{
Benjamin Peterson29060642009-01-31 22:14:21 +000014811 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014812 static char *kwlist[] = {"object", "encoding", "errors", 0};
14813 char *encoding = NULL;
14814 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000014815
Benjamin Peterson14339b62009-01-31 16:36:08 +000014816 if (type != &PyUnicode_Type)
14817 return unicode_subtype_new(type, args, kwds);
14818 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000014819 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000014820 return NULL;
14821 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020014822 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000014823 if (encoding == NULL && errors == NULL)
14824 return PyObject_Str(x);
14825 else
Benjamin Peterson29060642009-01-31 22:14:21 +000014826 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000014827}
14828
Guido van Rossume023fe02001-08-30 03:12:59 +000014829static PyObject *
14830unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14831{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014832 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014833 Py_ssize_t length, char_size;
14834 int share_wstr, share_utf8;
14835 unsigned int kind;
14836 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000014837
Benjamin Peterson14339b62009-01-31 16:36:08 +000014838 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014839
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014840 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014841 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014842 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014843 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050014844 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060014845 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014846 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060014847 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014848
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014849 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014850 if (self == NULL) {
14851 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014852 return NULL;
14853 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014854 kind = PyUnicode_KIND(unicode);
14855 length = PyUnicode_GET_LENGTH(unicode);
14856
14857 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014858#ifdef Py_DEBUG
14859 _PyUnicode_HASH(self) = -1;
14860#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014861 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014862#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014863 _PyUnicode_STATE(self).interned = 0;
14864 _PyUnicode_STATE(self).kind = kind;
14865 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020014866 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014867 _PyUnicode_STATE(self).ready = 1;
14868 _PyUnicode_WSTR(self) = NULL;
14869 _PyUnicode_UTF8_LENGTH(self) = 0;
14870 _PyUnicode_UTF8(self) = NULL;
14871 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020014872 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014873
14874 share_utf8 = 0;
14875 share_wstr = 0;
14876 if (kind == PyUnicode_1BYTE_KIND) {
14877 char_size = 1;
14878 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
14879 share_utf8 = 1;
14880 }
14881 else if (kind == PyUnicode_2BYTE_KIND) {
14882 char_size = 2;
14883 if (sizeof(wchar_t) == 2)
14884 share_wstr = 1;
14885 }
14886 else {
14887 assert(kind == PyUnicode_4BYTE_KIND);
14888 char_size = 4;
14889 if (sizeof(wchar_t) == 4)
14890 share_wstr = 1;
14891 }
14892
14893 /* Ensure we won't overflow the length. */
14894 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
14895 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014896 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014897 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014898 data = PyObject_MALLOC((length + 1) * char_size);
14899 if (data == NULL) {
14900 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014901 goto onError;
14902 }
14903
Victor Stinnerc3c74152011-10-02 20:39:55 +020014904 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014905 if (share_utf8) {
14906 _PyUnicode_UTF8_LENGTH(self) = length;
14907 _PyUnicode_UTF8(self) = data;
14908 }
14909 if (share_wstr) {
14910 _PyUnicode_WSTR_LENGTH(self) = length;
14911 _PyUnicode_WSTR(self) = (wchar_t *)data;
14912 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014913
Christian Heimesf051e432016-09-13 20:22:02 +020014914 memcpy(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020014915 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014916 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014917#ifdef Py_DEBUG
14918 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14919#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020014920 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010014921 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014922
14923onError:
14924 Py_DECREF(unicode);
14925 Py_DECREF(self);
14926 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000014927}
14928
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000014929PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070014930"str(object='') -> str\n\
14931str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000014932\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100014933Create a new string object from the given object. If encoding or\n\
14934errors is specified, then the object must expose a data buffer\n\
14935that will be decoded using the given encoding and error handler.\n\
14936Otherwise, returns the result of object.__str__() (if defined)\n\
14937or repr(object).\n\
14938encoding defaults to sys.getdefaultencoding().\n\
14939errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000014940
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014941static PyObject *unicode_iter(PyObject *seq);
14942
Guido van Rossumd57fd912000-03-10 22:53:23 +000014943PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000014944 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014945 "str", /* tp_name */
14946 sizeof(PyUnicodeObject), /* tp_size */
14947 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014948 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014949 (destructor)unicode_dealloc, /* tp_dealloc */
14950 0, /* tp_print */
14951 0, /* tp_getattr */
14952 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014953 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014954 unicode_repr, /* tp_repr */
14955 &unicode_as_number, /* tp_as_number */
14956 &unicode_as_sequence, /* tp_as_sequence */
14957 &unicode_as_mapping, /* tp_as_mapping */
14958 (hashfunc) unicode_hash, /* tp_hash*/
14959 0, /* tp_call*/
14960 (reprfunc) unicode_str, /* tp_str */
14961 PyObject_GenericGetAttr, /* tp_getattro */
14962 0, /* tp_setattro */
14963 0, /* tp_as_buffer */
14964 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000014965 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014966 unicode_doc, /* tp_doc */
14967 0, /* tp_traverse */
14968 0, /* tp_clear */
14969 PyUnicode_RichCompare, /* tp_richcompare */
14970 0, /* tp_weaklistoffset */
14971 unicode_iter, /* tp_iter */
14972 0, /* tp_iternext */
14973 unicode_methods, /* tp_methods */
14974 0, /* tp_members */
14975 0, /* tp_getset */
14976 &PyBaseObject_Type, /* tp_base */
14977 0, /* tp_dict */
14978 0, /* tp_descr_get */
14979 0, /* tp_descr_set */
14980 0, /* tp_dictoffset */
14981 0, /* tp_init */
14982 0, /* tp_alloc */
14983 unicode_new, /* tp_new */
14984 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014985};
14986
14987/* Initialize the Unicode implementation */
14988
Victor Stinner3a50e702011-10-18 21:21:00 +020014989int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014990{
Thomas Wouters477c8d52006-05-27 19:21:47 +000014991 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014992 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000014993 0x000A, /* LINE FEED */
14994 0x000D, /* CARRIAGE RETURN */
14995 0x001C, /* FILE SEPARATOR */
14996 0x001D, /* GROUP SEPARATOR */
14997 0x001E, /* RECORD SEPARATOR */
14998 0x0085, /* NEXT LINE */
14999 0x2028, /* LINE SEPARATOR */
15000 0x2029, /* PARAGRAPH SEPARATOR */
15001 };
15002
Fred Drakee4315f52000-05-09 19:53:39 +000015003 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020015004 _Py_INCREF_UNICODE_EMPTY();
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015005 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015006 Py_FatalError("Can't create empty string");
Serhiy Storchaka678db842013-01-26 12:16:36 +020015007 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015008
Guido van Rossumcacfc072002-05-24 19:01:59 +000015009 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000015010 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000015011
15012 /* initialize the linebreak bloom filter */
15013 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015014 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020015015 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015016
Christian Heimes26532f72013-07-20 14:57:16 +020015017 if (PyType_Ready(&EncodingMapType) < 0)
15018 Py_FatalError("Can't initialize encoding map type");
Victor Stinner3a50e702011-10-18 21:21:00 +020015019
Benjamin Petersonc4311282012-10-30 23:21:10 -040015020 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
15021 Py_FatalError("Can't initialize field name iterator type");
15022
15023 if (PyType_Ready(&PyFormatterIter_Type) < 0)
15024 Py_FatalError("Can't initialize formatter iter type");
Benjamin Petersone8ea97f2012-10-30 23:27:52 -040015025
Victor Stinner3a50e702011-10-18 21:21:00 +020015026 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015027}
15028
15029/* Finalize the Unicode implementation */
15030
Christian Heimesa156e092008-02-16 07:38:31 +000015031int
15032PyUnicode_ClearFreeList(void)
15033{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015034 return 0;
Christian Heimesa156e092008-02-16 07:38:31 +000015035}
15036
Guido van Rossumd57fd912000-03-10 22:53:23 +000015037void
Thomas Wouters78890102000-07-22 19:25:51 +000015038_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015039{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000015040 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015041
Serhiy Storchaka05997252013-01-26 12:14:02 +020015042 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +000015043
Serhiy Storchaka05997252013-01-26 12:14:02 +020015044 for (i = 0; i < 256; i++)
15045 Py_CLEAR(unicode_latin1[i]);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020015046 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000015047 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015048}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015049
Walter Dörwald16807132007-05-25 13:52:07 +000015050void
15051PyUnicode_InternInPlace(PyObject **p)
15052{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015053 PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015054 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015055#ifdef Py_DEBUG
15056 assert(s != NULL);
15057 assert(_PyUnicode_CHECK(s));
15058#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000015059 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020015060 return;
15061#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015062 /* If it's a subclass, we don't really know what putting
15063 it in the interned dict might do. */
15064 if (!PyUnicode_CheckExact(s))
15065 return;
15066 if (PyUnicode_CHECK_INTERNED(s))
15067 return;
15068 if (interned == NULL) {
15069 interned = PyDict_New();
15070 if (interned == NULL) {
15071 PyErr_Clear(); /* Don't leave an exception */
15072 return;
15073 }
15074 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015075 Py_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015076 t = PyDict_SetDefault(interned, s, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015077 Py_END_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015078 if (t == NULL) {
15079 PyErr_Clear();
15080 return;
15081 }
15082 if (t != s) {
Victor Stinnerf0335102013-04-14 19:13:03 +020015083 Py_INCREF(t);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030015084 Py_SETREF(*p, t);
Victor Stinnerf0335102013-04-14 19:13:03 +020015085 return;
15086 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015087 /* The two references in interned are not counted by refcnt.
15088 The deallocator will take care of this */
15089 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015090 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000015091}
15092
15093void
15094PyUnicode_InternImmortal(PyObject **p)
15095{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015096 PyUnicode_InternInPlace(p);
15097 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015098 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015099 Py_INCREF(*p);
15100 }
Walter Dörwald16807132007-05-25 13:52:07 +000015101}
15102
15103PyObject *
15104PyUnicode_InternFromString(const char *cp)
15105{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015106 PyObject *s = PyUnicode_FromString(cp);
15107 if (s == NULL)
15108 return NULL;
15109 PyUnicode_InternInPlace(&s);
15110 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015111}
15112
Alexander Belopolsky40018472011-02-26 01:02:56 +000015113void
15114_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000015115{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015116 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015117 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015118 Py_ssize_t i, n;
15119 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000015120
Benjamin Peterson14339b62009-01-31 16:36:08 +000015121 if (interned == NULL || !PyDict_Check(interned))
15122 return;
15123 keys = PyDict_Keys(interned);
15124 if (keys == NULL || !PyList_Check(keys)) {
15125 PyErr_Clear();
15126 return;
15127 }
Walter Dörwald16807132007-05-25 13:52:07 +000015128
Benjamin Peterson14339b62009-01-31 16:36:08 +000015129 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
15130 detector, interned unicode strings are not forcibly deallocated;
15131 rather, we give them their stolen references back, and then clear
15132 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000015133
Benjamin Peterson14339b62009-01-31 16:36:08 +000015134 n = PyList_GET_SIZE(keys);
15135 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000015136 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015137 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015138 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015139 if (PyUnicode_READY(s) == -1) {
15140 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015141 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015142 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015143 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015144 case SSTATE_NOT_INTERNED:
15145 /* XXX Shouldn't happen */
15146 break;
15147 case SSTATE_INTERNED_IMMORTAL:
15148 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015149 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015150 break;
15151 case SSTATE_INTERNED_MORTAL:
15152 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015153 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015154 break;
15155 default:
15156 Py_FatalError("Inconsistent interned string state.");
15157 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015158 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015159 }
15160 fprintf(stderr, "total size of all interned strings: "
15161 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
15162 "mortal/immortal\n", mortal_size, immortal_size);
15163 Py_DECREF(keys);
15164 PyDict_Clear(interned);
Serhiy Storchaka05997252013-01-26 12:14:02 +020015165 Py_CLEAR(interned);
Walter Dörwald16807132007-05-25 13:52:07 +000015166}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015167
15168
15169/********************* Unicode Iterator **************************/
15170
15171typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015172 PyObject_HEAD
15173 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015174 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015175} unicodeiterobject;
15176
15177static void
15178unicodeiter_dealloc(unicodeiterobject *it)
15179{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015180 _PyObject_GC_UNTRACK(it);
15181 Py_XDECREF(it->it_seq);
15182 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015183}
15184
15185static int
15186unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15187{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015188 Py_VISIT(it->it_seq);
15189 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015190}
15191
15192static PyObject *
15193unicodeiter_next(unicodeiterobject *it)
15194{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015195 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015196
Benjamin Peterson14339b62009-01-31 16:36:08 +000015197 assert(it != NULL);
15198 seq = it->it_seq;
15199 if (seq == NULL)
15200 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015201 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015202
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015203 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15204 int kind = PyUnicode_KIND(seq);
15205 void *data = PyUnicode_DATA(seq);
15206 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15207 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015208 if (item != NULL)
15209 ++it->it_index;
15210 return item;
15211 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015212
Benjamin Peterson14339b62009-01-31 16:36:08 +000015213 it->it_seq = NULL;
Serhiy Storchakafbb1c5e2016-03-30 20:40:02 +030015214 Py_DECREF(seq);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015215 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015216}
15217
15218static PyObject *
15219unicodeiter_len(unicodeiterobject *it)
15220{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015221 Py_ssize_t len = 0;
15222 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015223 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015224 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015225}
15226
15227PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15228
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015229static PyObject *
15230unicodeiter_reduce(unicodeiterobject *it)
15231{
15232 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020015233 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015234 it->it_seq, it->it_index);
15235 } else {
15236 PyObject *u = PyUnicode_FromUnicode(NULL, 0);
15237 if (u == NULL)
15238 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020015239 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015240 }
15241}
15242
15243PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15244
15245static PyObject *
15246unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15247{
15248 Py_ssize_t index = PyLong_AsSsize_t(state);
15249 if (index == -1 && PyErr_Occurred())
15250 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015251 if (it->it_seq != NULL) {
15252 if (index < 0)
15253 index = 0;
15254 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15255 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15256 it->it_index = index;
15257 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015258 Py_RETURN_NONE;
15259}
15260
15261PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15262
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015263static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015264 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015265 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015266 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15267 reduce_doc},
15268 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15269 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015270 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015271};
15272
15273PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015274 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15275 "str_iterator", /* tp_name */
15276 sizeof(unicodeiterobject), /* tp_basicsize */
15277 0, /* tp_itemsize */
15278 /* methods */
15279 (destructor)unicodeiter_dealloc, /* tp_dealloc */
15280 0, /* tp_print */
15281 0, /* tp_getattr */
15282 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000015283 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015284 0, /* tp_repr */
15285 0, /* tp_as_number */
15286 0, /* tp_as_sequence */
15287 0, /* tp_as_mapping */
15288 0, /* tp_hash */
15289 0, /* tp_call */
15290 0, /* tp_str */
15291 PyObject_GenericGetAttr, /* tp_getattro */
15292 0, /* tp_setattro */
15293 0, /* tp_as_buffer */
15294 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15295 0, /* tp_doc */
15296 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15297 0, /* tp_clear */
15298 0, /* tp_richcompare */
15299 0, /* tp_weaklistoffset */
15300 PyObject_SelfIter, /* tp_iter */
15301 (iternextfunc)unicodeiter_next, /* tp_iternext */
15302 unicodeiter_methods, /* tp_methods */
15303 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015304};
15305
15306static PyObject *
15307unicode_iter(PyObject *seq)
15308{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015309 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015310
Benjamin Peterson14339b62009-01-31 16:36:08 +000015311 if (!PyUnicode_Check(seq)) {
15312 PyErr_BadInternalCall();
15313 return NULL;
15314 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015315 if (PyUnicode_READY(seq) == -1)
15316 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015317 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15318 if (it == NULL)
15319 return NULL;
15320 it->it_index = 0;
15321 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015322 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015323 _PyObject_GC_TRACK(it);
15324 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015325}
15326
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015327
15328size_t
15329Py_UNICODE_strlen(const Py_UNICODE *u)
15330{
15331 int res = 0;
15332 while(*u++)
15333 res++;
15334 return res;
15335}
15336
15337Py_UNICODE*
15338Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15339{
15340 Py_UNICODE *u = s1;
15341 while ((*u++ = *s2++));
15342 return s1;
15343}
15344
15345Py_UNICODE*
15346Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15347{
15348 Py_UNICODE *u = s1;
15349 while ((*u++ = *s2++))
15350 if (n-- == 0)
15351 break;
15352 return s1;
15353}
15354
15355Py_UNICODE*
15356Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15357{
15358 Py_UNICODE *u1 = s1;
15359 u1 += Py_UNICODE_strlen(u1);
15360 Py_UNICODE_strcpy(u1, s2);
15361 return s1;
15362}
15363
15364int
15365Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15366{
15367 while (*s1 && *s2 && *s1 == *s2)
15368 s1++, s2++;
15369 if (*s1 && *s2)
15370 return (*s1 < *s2) ? -1 : +1;
15371 if (*s1)
15372 return 1;
15373 if (*s2)
15374 return -1;
15375 return 0;
15376}
15377
15378int
15379Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15380{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015381 Py_UNICODE u1, u2;
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015382 for (; n != 0; n--) {
15383 u1 = *s1;
15384 u2 = *s2;
15385 if (u1 != u2)
15386 return (u1 < u2) ? -1 : +1;
15387 if (u1 == '\0')
15388 return 0;
15389 s1++;
15390 s2++;
15391 }
15392 return 0;
15393}
15394
15395Py_UNICODE*
15396Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15397{
15398 const Py_UNICODE *p;
15399 for (p = s; *p; p++)
15400 if (*p == c)
15401 return (Py_UNICODE*)p;
15402 return NULL;
15403}
15404
15405Py_UNICODE*
15406Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15407{
15408 const Py_UNICODE *p;
15409 p = s + Py_UNICODE_strlen(s);
15410 while (p != s) {
15411 p--;
15412 if (*p == c)
15413 return (Py_UNICODE*)p;
15414 }
15415 return NULL;
15416}
Victor Stinner331ea922010-08-10 16:37:20 +000015417
Victor Stinner71133ff2010-09-01 23:43:53 +000015418Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015419PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015420{
Victor Stinner577db2c2011-10-11 22:12:48 +020015421 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015422 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015423
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015424 if (!PyUnicode_Check(unicode)) {
15425 PyErr_BadArgument();
15426 return NULL;
15427 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015428 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015429 if (u == NULL)
15430 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015431 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -070015432 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015433 PyErr_NoMemory();
15434 return NULL;
15435 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015436 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015437 size *= sizeof(Py_UNICODE);
15438 copy = PyMem_Malloc(size);
15439 if (copy == NULL) {
15440 PyErr_NoMemory();
15441 return NULL;
15442 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015443 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015444 return copy;
15445}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015446
Georg Brandl66c221e2010-10-14 07:04:07 +000015447/* A _string module, to export formatter_parser and formatter_field_name_split
15448 to the string.Formatter class implemented in Python. */
15449
15450static PyMethodDef _string_methods[] = {
15451 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15452 METH_O, PyDoc_STR("split the argument as a field name")},
15453 {"formatter_parser", (PyCFunction) formatter_parser,
15454 METH_O, PyDoc_STR("parse the argument as a format string")},
15455 {NULL, NULL}
15456};
15457
15458static struct PyModuleDef _string_module = {
15459 PyModuleDef_HEAD_INIT,
15460 "_string",
15461 PyDoc_STR("string helper module"),
15462 0,
15463 _string_methods,
15464 NULL,
15465 NULL,
15466 NULL,
15467 NULL
15468};
15469
15470PyMODINIT_FUNC
15471PyInit__string(void)
15472{
15473 return PyModule_Create(&_string_module);
15474}
15475
15476
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000015477#ifdef __cplusplus
15478}
15479#endif