/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com>.

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Raymond Hettingerac2ef652015-07-04 16:04:44 -070045#include "stringlib/eq.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000046
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000047#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000048#include <windows.h>
49#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000050
Larry Hastings61272b72014-01-07 12:41:53 -080051/*[clinic input]
Larry Hastingsc2047262014-01-25 20:43:29 -080052class str "PyUnicodeObject *" "&PyUnicode_Type"
Larry Hastings61272b72014-01-07 12:41:53 -080053[clinic start generated code]*/
Larry Hastings581ee362014-01-28 05:00:08 -080054/*[clinic end generated code: output=da39a3ee5e6b4b0d input=604e916854800fa8]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080055
/* --- Globals ------------------------------------------------------------

NOTE: In the interpreter's initialization phase, some globals are currently
      initialized dynamically as needed. In the process Unicode objects may
      be created before the Unicode type is ready.

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000063
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000064
65#ifdef __cplusplus
66extern "C" {
67#endif
68
/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
#define MAX_UNICODE 0x10ffff

/* Sanity check used by the accessor macros in this file.  A debug build
   runs the structural consistency check (check_content == 0, so the
   character data itself is not scanned); a release build only performs
   the type check. */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020077
/* Field accessors for the string structs.
 *
 * The names starting with an underscore that consist of a single cast and
 * member access are raw accessors: they perform no checks and can be used
 * as lvalues.  The other macros assert (in builds where assert() is
 * active) that the object passes _PyUnicode_CHECK() before touching it.
 */

/* Raw pointer to the cached UTF-8 buffer (PyCompactUnicodeObject.utf8). */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
/* Checked UTF-8 pointer of a ready string: for a compact ASCII string
   this is the memory immediately following the PyASCIIObject header,
   otherwise the utf8 member. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
/* Raw utf8_length member. */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
/* Checked UTF-8 length of a ready string: for a compact ASCII string the
   character count is used, otherwise the utf8_length member. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))
/* Raw wstr pointer and its length. */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
/* Raw length, state bit-field struct and hash members. */
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)
/* Checked access to state.kind and length. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)
/* Raw data.any pointer of a PyUnicodeObject. */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)

/* File-local replacement for PyUnicode_READY(): evaluates to 0 when the
   string is already ready, otherwise to the result of _PyUnicode_Ready().
   The previous definition is discarded by the #undef. */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200119
/* True if the UTF-8 pointer of a (non compact-ASCII) string is the same
   pointer as its character data, i.e. no separate UTF-8 block exists. */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the wstr pointer of the string is the same pointer as its
   character data.  The macro must only reference its own parameter: the
   previous expansion used a hard-coded `unicode` identifier, which only
   compiled where such a variable happened to be in scope and then
   inspected that variable instead of `op`. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
127
/* true if the Unicode object has an allocated UTF-8 memory block
   (not shared with other data): the string is not compact ASCII, its
   utf8 pointer is set, and that pointer differs from the data pointer */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    ((!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* true if the Unicode object has an allocated wstr memory block
   (not shared with other data): the wstr pointer is set and either the
   string is not ready yet or wstr differs from the data pointer */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    ((_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
141
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   Each character is converted with a plain cast.  The bulk of the input
   is handled four characters per iteration; the remaining (at most
   three) characters are copied one at a time. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (from_type *)(begin);  \
        const from_type *_end = (from_type *)(end);     \
        Py_ssize_t n = (_end) - (_iter);                \
        const from_type *_unrolled_end =                \
            _iter + _Py_SIZE_ROUND_DOWN(n, 4);          \
        for (; _iter < (_unrolled_end); _iter += 4, _to += 4) { \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
        }                                               \
        for (; _iter < (_end); ++_iter, ++_to) {        \
            *_to = (to_type) *_iter;                    \
        }                                               \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200165
/* Platform-dependent overallocation divisor.
   NOTE(review): the percentages below imply the extra space is
   size / OVERALLOCATE_FACTOR; the code using this macro is not visible
   here -- confirm. */
#ifdef MS_WINDOWS
   /* On Windows, overallocating by 50% is the best factor */
#  define OVERALLOCATE_FACTOR 2
#else
   /* On Linux, overallocating by 25% is the best factor */
#  define OVERALLOCATE_FACTOR 4
#endif
173
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000183
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000184/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200185static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200186
Serhiy Storchaka678db842013-01-26 12:16:36 +0200187#define _Py_INCREF_UNICODE_EMPTY() \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200188 do { \
189 if (unicode_empty != NULL) \
190 Py_INCREF(unicode_empty); \
191 else { \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200192 unicode_empty = PyUnicode_New(0, 0); \
193 if (unicode_empty != NULL) { \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200194 Py_INCREF(unicode_empty); \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200195 assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
196 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200197 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200198 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000199
Serhiy Storchaka678db842013-01-26 12:16:36 +0200200#define _Py_RETURN_UNICODE_EMPTY() \
201 do { \
202 _Py_INCREF_UNICODE_EMPTY(); \
203 return unicode_empty; \
204 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000205
/* Forward declaration */
static inline int
_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);

/* List of static strings. */
static _Py_Identifier *static_strings = NULL;

/* Single character Unicode strings in the Latin-1 range are being
   shared as well.  The table is indexed by code point (0..255); a NULL
   entry means no string has been cached for that character yet. */
static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000216
/* Fast detection of the most frequent whitespace characters.
   Lookup table indexed by code point 0..127: 1 for whitespace, else 0. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F:
     *   0x0009 CHARACTER TABULATION
     *   0x000A LINE FEED
     *   0x000B LINE TABULATION
     *   0x000C FORM FEED
     *   0x000D CARRIAGE RETURN
     */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F:
     *   0x001C FILE SEPARATOR
     *   0x001D GROUP SEPARATOR
     *   0x001E RECORD SEPARATOR
     *   0x001F UNIT SEPARATOR
     */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27:
     *   0x0020 SPACE
     */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
247
/* Forward declarations of helpers that are used before their
   definitions later in this file. */
static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
static PyObject* get_latin1_char(unsigned char ch);
static int unicode_modifiable(PyObject *unicode);


static PyObject *
_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
static PyObject *
_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
static PyObject *
_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);

static PyObject *
unicode_encode_call_errorhandler(const char *errors,
       PyObject **errorHandler,const char *encoding, const char *reason,
       PyObject *unicode, PyObject **exceptionObject,
       Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);

static void
raise_encode_exception(PyObject **exceptionObject,
                       const char *encoding,
                       PyObject *unicode,
                       Py_ssize_t startpos, Py_ssize_t endpos,
                       const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000273
/* Same for linebreaks.
   Lookup table indexed by code point 0..127: 1 for a line break, else 0. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F:
     *   0x000A LINE FEED
     *   0x000B LINE TABULATION
     *   0x000C FORM FEED
     *   0x000D CARRIAGE RETURN
     */
    0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F:
     *   0x001C FILE SEPARATOR
     *   0x001D GROUP SEPARATOR
     *   0x001E RECORD SEPARATOR
     */
    0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
301
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300302#include "clinic/unicodeobject.c.h"
303
/* Codec error handlers recognized by name.  _Py_ERROR_UNKNOWN (0) is
   never returned by get_error_handler(); _Py_ERROR_OTHER stands for any
   name that is not one of the built-in ones listed here. */
typedef enum {
    _Py_ERROR_UNKNOWN=0,
    _Py_ERROR_STRICT,
    _Py_ERROR_SURROGATEESCAPE,
    _Py_ERROR_REPLACE,
    _Py_ERROR_IGNORE,
    _Py_ERROR_BACKSLASHREPLACE,
    _Py_ERROR_SURROGATEPASS,
    _Py_ERROR_XMLCHARREFREPLACE,
    _Py_ERROR_OTHER
} _Py_error_handler;

/* Map an error handler name to its _Py_error_handler constant.

   errors: NUL-terminated handler name, or NULL.

   Returns _Py_ERROR_STRICT for NULL or "strict", the matching constant
   for the other built-in names, and _Py_ERROR_OTHER for anything else.
   The comparison is exact (case-sensitive). */
static _Py_error_handler
get_error_handler(const char *errors)
{
    static const struct {
        const char *name;
        _Py_error_handler handler;
    } known_handlers[] = {
        {"strict",            _Py_ERROR_STRICT},
        {"surrogateescape",   _Py_ERROR_SURROGATEESCAPE},
        {"replace",           _Py_ERROR_REPLACE},
        {"ignore",            _Py_ERROR_IGNORE},
        {"backslashreplace",  _Py_ERROR_BACKSLASHREPLACE},
        {"surrogatepass",     _Py_ERROR_SURROGATEPASS},
        {"xmlcharrefreplace", _Py_ERROR_XMLCHARREFREPLACE},
    };
    size_t i;

    if (errors == NULL) {
        return _Py_ERROR_STRICT;
    }
    for (i = 0; i < sizeof(known_handlers) / sizeof(known_handlers[0]); i++) {
        if (strcmp(errors, known_handlers[i].name) == 0) {
            return known_handlers[i].handler;
        }
    }
    return _Py_ERROR_OTHER;
}
342
/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
   This function is kept for backward compatibility with the old API.

   The value returned here depends on the build: 0x10FFFF when
   Py_UNICODE_WIDE is defined, 0xFFFF otherwise. */
Py_UNICODE
PyUnicode_GetMax(void)
{
#ifdef Py_UNICODE_WIDE
    return 0x10FFFF;
#else
    /* This is actually an illegal character, so it should
       not be passed to unichr. */
    return 0xFFFF;
#endif
}
356
#ifdef Py_DEBUG
/* Debug-build invariant checker for a string object.

   op:            object to check; must satisfy PyUnicode_Check().
   check_content: when non-zero (and the string is not in the legacy
                  wchar_t-only state), additionally scan the characters
                  to verify that the narrowest possible kind is used and
                  that the data is followed by a null character.

   Every check is an assert(), so a violated invariant aborts; the
   function itself always returns 1, which lets callers wrap the call in
   assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    /* Case 1: compact ASCII string. */
    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        /* Case 2: compact non-ASCII string; the character data is
           expected right after the PyCompactUnicodeObject header. */
        if (ascii->state.compact == 1) {
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }
        else {
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            /* Case 3: legacy string that is not ready yet; only the
               wstr buffer may be set. */
            if (kind == PyUnicode_WCHAR_KIND) {
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
            /* Case 4: legacy string that is ready. */
            else {
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                /* an ASCII string shares its data with the UTF-8 form */
                if (ascii->state.ascii) {
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
                else
                    assert (compact->utf8 != data);
            }
        }
        /* For ready strings, wstr must alias the data exactly when the
           kind has the same width as wchar_t. */
        if (kind != PyUnicode_WCHAR_KIND) {
            if (
#if SIZEOF_WCHAR_T == 2
                kind == PyUnicode_2BYTE_KIND
#else
                kind == PyUnicode_4BYTE_KIND
#endif
               )
            {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            } else
                assert(ascii->wstr != data);
        }

        /* an absent buffer must have a zero length recorded */
        if (compact->utf8 == NULL)
            assert(compact->utf8_length == 0);
        if (ascii->wstr == NULL)
            assert(compact->wstr_length == 0);
    }
    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND)
    {
        Py_ssize_t i;
        Py_UCS4 maxchar = 0;
        void *data;
        Py_UCS4 ch;

        data = PyUnicode_DATA(ascii);
        /* find the largest character actually stored */
        for (i=0; i < ascii->length; i++)
        {
            ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
        }
        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else
                assert(maxchar < 128);
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
        }
        else {
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
        }
        /* the character data must be null-terminated */
        assert(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;
}
#endif
472
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100473static PyObject*
474unicode_result_wchar(PyObject *unicode)
475{
476#ifndef Py_DEBUG
477 Py_ssize_t len;
478
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100479 len = _PyUnicode_WSTR_LENGTH(unicode);
480 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100481 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200482 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100483 }
484
485 if (len == 1) {
486 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100487 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100488 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
489 Py_DECREF(unicode);
490 return latin1_char;
491 }
492 }
493
494 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200495 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100496 return NULL;
497 }
498#else
Victor Stinneraa771272012-10-04 02:32:58 +0200499 assert(Py_REFCNT(unicode) == 1);
500
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100501 /* don't make the result ready in debug mode to ensure that the caller
502 makes the string ready before using it */
503 assert(_PyUnicode_CheckConsistency(unicode, 1));
504#endif
505 return unicode;
506}
507
508static PyObject*
509unicode_result_ready(PyObject *unicode)
510{
511 Py_ssize_t length;
512
513 length = PyUnicode_GET_LENGTH(unicode);
514 if (length == 0) {
515 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100516 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200517 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100518 }
519 return unicode_empty;
520 }
521
522 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200523 void *data = PyUnicode_DATA(unicode);
524 int kind = PyUnicode_KIND(unicode);
525 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100526 if (ch < 256) {
527 PyObject *latin1_char = unicode_latin1[ch];
528 if (latin1_char != NULL) {
529 if (unicode != latin1_char) {
530 Py_INCREF(latin1_char);
531 Py_DECREF(unicode);
532 }
533 return latin1_char;
534 }
535 else {
536 assert(_PyUnicode_CheckConsistency(unicode, 1));
537 Py_INCREF(unicode);
538 unicode_latin1[ch] = unicode;
539 return unicode;
540 }
541 }
542 }
543
544 assert(_PyUnicode_CheckConsistency(unicode, 1));
545 return unicode;
546}
547
548static PyObject*
549unicode_result(PyObject *unicode)
550{
551 assert(_PyUnicode_CHECK(unicode));
552 if (PyUnicode_IS_READY(unicode))
553 return unicode_result_ready(unicode);
554 else
555 return unicode_result_wchar(unicode);
556}
557
Victor Stinnerc4b49542011-12-11 22:44:26 +0100558static PyObject*
559unicode_result_unchanged(PyObject *unicode)
560{
561 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500562 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100563 return NULL;
564 Py_INCREF(unicode);
565 return unicode;
566 }
567 else
568 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100569 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100570}
571
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200572/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
573 ASCII, Latin1, UTF-8, etc. */
574static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200575backslashreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200576 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
577{
Victor Stinnerad771582015-10-09 12:38:53 +0200578 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200579 Py_UCS4 ch;
580 enum PyUnicode_Kind kind;
581 void *data;
582
583 assert(PyUnicode_IS_READY(unicode));
584 kind = PyUnicode_KIND(unicode);
585 data = PyUnicode_DATA(unicode);
586
587 size = 0;
588 /* determine replacement size */
589 for (i = collstart; i < collend; ++i) {
590 Py_ssize_t incr;
591
592 ch = PyUnicode_READ(kind, data, i);
593 if (ch < 0x100)
594 incr = 2+2;
595 else if (ch < 0x10000)
596 incr = 2+4;
597 else {
598 assert(ch <= MAX_UNICODE);
Victor Stinner3fa36ff2015-10-09 03:37:11 +0200599 incr = 2+8;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200600 }
601 if (size > PY_SSIZE_T_MAX - incr) {
602 PyErr_SetString(PyExc_OverflowError,
603 "encoded result is too long for a Python string");
604 return NULL;
605 }
606 size += incr;
607 }
608
Victor Stinnerad771582015-10-09 12:38:53 +0200609 str = _PyBytesWriter_Prepare(writer, str, size);
610 if (str == NULL)
611 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200612
613 /* generate replacement */
614 for (i = collstart; i < collend; ++i) {
615 ch = PyUnicode_READ(kind, data, i);
Victor Stinner797485e2015-10-09 03:17:30 +0200616 *str++ = '\\';
617 if (ch >= 0x00010000) {
618 *str++ = 'U';
619 *str++ = Py_hexdigits[(ch>>28)&0xf];
620 *str++ = Py_hexdigits[(ch>>24)&0xf];
621 *str++ = Py_hexdigits[(ch>>20)&0xf];
622 *str++ = Py_hexdigits[(ch>>16)&0xf];
623 *str++ = Py_hexdigits[(ch>>12)&0xf];
624 *str++ = Py_hexdigits[(ch>>8)&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200625 }
Victor Stinner797485e2015-10-09 03:17:30 +0200626 else if (ch >= 0x100) {
627 *str++ = 'u';
628 *str++ = Py_hexdigits[(ch>>12)&0xf];
629 *str++ = Py_hexdigits[(ch>>8)&0xf];
630 }
631 else
632 *str++ = 'x';
633 *str++ = Py_hexdigits[(ch>>4)&0xf];
634 *str++ = Py_hexdigits[ch&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200635 }
636 return str;
637}
638
639/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
640 ASCII, Latin1, UTF-8, etc. */
641static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200642xmlcharrefreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200643 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
644{
Victor Stinnerad771582015-10-09 12:38:53 +0200645 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200646 Py_UCS4 ch;
647 enum PyUnicode_Kind kind;
648 void *data;
649
650 assert(PyUnicode_IS_READY(unicode));
651 kind = PyUnicode_KIND(unicode);
652 data = PyUnicode_DATA(unicode);
653
654 size = 0;
655 /* determine replacement size */
656 for (i = collstart; i < collend; ++i) {
657 Py_ssize_t incr;
658
659 ch = PyUnicode_READ(kind, data, i);
660 if (ch < 10)
661 incr = 2+1+1;
662 else if (ch < 100)
663 incr = 2+2+1;
664 else if (ch < 1000)
665 incr = 2+3+1;
666 else if (ch < 10000)
667 incr = 2+4+1;
668 else if (ch < 100000)
669 incr = 2+5+1;
670 else if (ch < 1000000)
671 incr = 2+6+1;
672 else {
673 assert(ch <= MAX_UNICODE);
674 incr = 2+7+1;
675 }
676 if (size > PY_SSIZE_T_MAX - incr) {
677 PyErr_SetString(PyExc_OverflowError,
678 "encoded result is too long for a Python string");
679 return NULL;
680 }
681 size += incr;
682 }
683
Victor Stinnerad771582015-10-09 12:38:53 +0200684 str = _PyBytesWriter_Prepare(writer, str, size);
685 if (str == NULL)
686 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200687
688 /* generate replacement */
689 for (i = collstart; i < collend; ++i) {
690 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
691 }
692 return str;
693}
694
Thomas Wouters477c8d52006-05-27 19:21:47 +0000695/* --- Bloom Filters ----------------------------------------------------- */
696
697/* stuff to implement simple "bloom filters" for Unicode characters.
698 to keep things simple, we use a single bitmask, using the least 5
699 bits from each unicode characters as the bit index. */
700
701/* the linebreak mask is set up by Unicode_Init below */
702
Antoine Pitrouf068f942010-01-13 14:19:12 +0000703#if LONG_BIT >= 128
704#define BLOOM_WIDTH 128
705#elif LONG_BIT >= 64
706#define BLOOM_WIDTH 64
707#elif LONG_BIT >= 32
708#define BLOOM_WIDTH 32
709#else
710#error "LONG_BIT is smaller than 32"
711#endif
712
Thomas Wouters477c8d52006-05-27 19:21:47 +0000713#define BLOOM_MASK unsigned long
714
Serhiy Storchaka05997252013-01-26 12:14:02 +0200715static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000716
Antoine Pitrouf068f942010-01-13 14:19:12 +0000717#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000718
Benjamin Peterson29060642009-01-31 22:14:21 +0000719#define BLOOM_LINEBREAK(ch) \
720 ((ch) < 128U ? ascii_linebreak[(ch)] : \
721 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000722
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700723static inline BLOOM_MASK
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200724make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000725{
Victor Stinnera85af502013-04-09 21:53:54 +0200726#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
727 do { \
728 TYPE *data = (TYPE *)PTR; \
729 TYPE *end = data + LEN; \
730 Py_UCS4 ch; \
731 for (; data != end; data++) { \
732 ch = *data; \
733 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
734 } \
735 break; \
736 } while (0)
737
Thomas Wouters477c8d52006-05-27 19:21:47 +0000738 /* calculate simple bloom-style bitmask for a given unicode string */
739
Antoine Pitrouf068f942010-01-13 14:19:12 +0000740 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000741
742 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200743 switch (kind) {
744 case PyUnicode_1BYTE_KIND:
745 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
746 break;
747 case PyUnicode_2BYTE_KIND:
748 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
749 break;
750 case PyUnicode_4BYTE_KIND:
751 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
752 break;
753 default:
754 assert(0);
755 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000756 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200757
758#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000759}
760
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300761static int
762ensure_unicode(PyObject *obj)
763{
764 if (!PyUnicode_Check(obj)) {
765 PyErr_Format(PyExc_TypeError,
766 "must be str, not %.100s",
767 Py_TYPE(obj)->tp_name);
768 return -1;
769 }
770 return PyUnicode_READY(obj);
771}
772
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200773/* Compilation of templated routines */
774
775#include "stringlib/asciilib.h"
776#include "stringlib/fastsearch.h"
777#include "stringlib/partition.h"
778#include "stringlib/split.h"
779#include "stringlib/count.h"
780#include "stringlib/find.h"
781#include "stringlib/find_max_char.h"
782#include "stringlib/localeutil.h"
783#include "stringlib/undef.h"
784
785#include "stringlib/ucs1lib.h"
786#include "stringlib/fastsearch.h"
787#include "stringlib/partition.h"
788#include "stringlib/split.h"
789#include "stringlib/count.h"
790#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300791#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200792#include "stringlib/find_max_char.h"
793#include "stringlib/localeutil.h"
794#include "stringlib/undef.h"
795
796#include "stringlib/ucs2lib.h"
797#include "stringlib/fastsearch.h"
798#include "stringlib/partition.h"
799#include "stringlib/split.h"
800#include "stringlib/count.h"
801#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300802#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200803#include "stringlib/find_max_char.h"
804#include "stringlib/localeutil.h"
805#include "stringlib/undef.h"
806
807#include "stringlib/ucs4lib.h"
808#include "stringlib/fastsearch.h"
809#include "stringlib/partition.h"
810#include "stringlib/split.h"
811#include "stringlib/count.h"
812#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300813#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200814#include "stringlib/find_max_char.h"
815#include "stringlib/localeutil.h"
816#include "stringlib/undef.h"
817
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200818#include "stringlib/unicodedefs.h"
819#include "stringlib/fastsearch.h"
820#include "stringlib/count.h"
821#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100822#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200823
Guido van Rossumd57fd912000-03-10 22:53:23 +0000824/* --- Unicode Object ----------------------------------------------------- */
825
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200826static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200827fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200828
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700829static inline Py_ssize_t
830findchar(const void *s, int kind,
831 Py_ssize_t size, Py_UCS4 ch,
832 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200833{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200834 switch (kind) {
835 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200836 if ((Py_UCS1) ch != ch)
837 return -1;
838 if (direction > 0)
839 return ucs1lib_find_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
840 else
841 return ucs1lib_rfind_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200842 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200843 if ((Py_UCS2) ch != ch)
844 return -1;
845 if (direction > 0)
846 return ucs2lib_find_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
847 else
848 return ucs2lib_rfind_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200849 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200850 if (direction > 0)
851 return ucs4lib_find_char((Py_UCS4 *) s, size, ch);
852 else
853 return ucs4lib_rfind_char((Py_UCS4 *) s, size, ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200854 default:
855 assert(0);
856 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200857 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200858}
859
Victor Stinnerafffce42012-10-03 23:03:17 +0200860#ifdef Py_DEBUG
Martin Panter6245cb32016-04-15 02:14:19 +0000861/* Fill the data of a Unicode string with invalid characters to detect bugs
Victor Stinnerafffce42012-10-03 23:03:17 +0200862 earlier.
863
864 _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
865 ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
866 invalid character in Unicode 6.0. */
867static void
868unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
869{
870 int kind = PyUnicode_KIND(unicode);
871 Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
872 Py_ssize_t length = _PyUnicode_LENGTH(unicode);
873 if (length <= old_length)
874 return;
875 memset(data + old_length * kind, 0xff, (length - old_length) * kind);
876}
877#endif
878
Victor Stinnerfe226c02011-10-03 03:52:20 +0200879static PyObject*
880resize_compact(PyObject *unicode, Py_ssize_t length)
881{
882 Py_ssize_t char_size;
883 Py_ssize_t struct_size;
884 Py_ssize_t new_size;
885 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100886 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +0200887#ifdef Py_DEBUG
888 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
889#endif
890
Victor Stinner79891572012-05-03 13:43:07 +0200891 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200892 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100893 assert(PyUnicode_IS_COMPACT(unicode));
894
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200895 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100896 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200897 struct_size = sizeof(PyASCIIObject);
898 else
899 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200900 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200901
Victor Stinnerfe226c02011-10-03 03:52:20 +0200902 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
903 PyErr_NoMemory();
904 return NULL;
905 }
906 new_size = (struct_size + (length + 1) * char_size);
907
Serhiy Storchaka7aa69082015-12-03 01:02:03 +0200908 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
909 PyObject_DEL(_PyUnicode_UTF8(unicode));
910 _PyUnicode_UTF8(unicode) = NULL;
911 _PyUnicode_UTF8_LENGTH(unicode) = 0;
912 }
Victor Stinner84def372011-12-11 20:04:56 +0100913 _Py_DEC_REFTOTAL;
914 _Py_ForgetReference(unicode);
915
Serhiy Storchaka20b39b22014-09-28 11:27:24 +0300916 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +0100917 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100918 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200919 PyErr_NoMemory();
920 return NULL;
921 }
Victor Stinner84def372011-12-11 20:04:56 +0100922 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200923 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100924
Victor Stinnerfe226c02011-10-03 03:52:20 +0200925 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200926 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200927 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100928 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200929 _PyUnicode_WSTR_LENGTH(unicode) = length;
930 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100931 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
932 PyObject_DEL(_PyUnicode_WSTR(unicode));
933 _PyUnicode_WSTR(unicode) = NULL;
Victor Stinner5bc03a62016-01-27 16:56:53 +0100934 if (!PyUnicode_IS_ASCII(unicode))
935 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100936 }
Victor Stinnerafffce42012-10-03 23:03:17 +0200937#ifdef Py_DEBUG
938 unicode_fill_invalid(unicode, old_length);
939#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200940 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
941 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200942 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200943 return unicode;
944}
945
Alexander Belopolsky40018472011-02-26 01:02:56 +0000946static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200947resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000948{
Victor Stinner95663112011-10-04 01:03:50 +0200949 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100950 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200951 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200952 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000953
Victor Stinnerfe226c02011-10-03 03:52:20 +0200954 if (PyUnicode_IS_READY(unicode)) {
955 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200956 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200957 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +0200958#ifdef Py_DEBUG
959 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
960#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200961
962 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200963 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200964 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
965 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200966
967 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
968 PyErr_NoMemory();
969 return -1;
970 }
971 new_size = (length + 1) * char_size;
972
Victor Stinner7a9105a2011-12-12 00:13:42 +0100973 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
974 {
975 PyObject_DEL(_PyUnicode_UTF8(unicode));
976 _PyUnicode_UTF8(unicode) = NULL;
977 _PyUnicode_UTF8_LENGTH(unicode) = 0;
978 }
979
Victor Stinnerfe226c02011-10-03 03:52:20 +0200980 data = (PyObject *)PyObject_REALLOC(data, new_size);
981 if (data == NULL) {
982 PyErr_NoMemory();
983 return -1;
984 }
985 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200986 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200987 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200988 _PyUnicode_WSTR_LENGTH(unicode) = length;
989 }
990 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200991 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200992 _PyUnicode_UTF8_LENGTH(unicode) = length;
993 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200994 _PyUnicode_LENGTH(unicode) = length;
995 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +0200996#ifdef Py_DEBUG
997 unicode_fill_invalid(unicode, old_length);
998#endif
Victor Stinner95663112011-10-04 01:03:50 +0200999 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001000 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001001 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001002 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001003 }
Victor Stinner95663112011-10-04 01:03:50 +02001004 assert(_PyUnicode_WSTR(unicode) != NULL);
1005
1006 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001007 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +02001008 PyErr_NoMemory();
1009 return -1;
1010 }
Victor Stinner7a9105a2011-12-12 00:13:42 +01001011 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +02001012 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +01001013 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +02001014 if (!wstr) {
1015 PyErr_NoMemory();
1016 return -1;
1017 }
1018 _PyUnicode_WSTR(unicode) = wstr;
1019 _PyUnicode_WSTR(unicode)[length] = 0;
1020 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001021 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001022 return 0;
1023}
1024
Victor Stinnerfe226c02011-10-03 03:52:20 +02001025static PyObject*
1026resize_copy(PyObject *unicode, Py_ssize_t length)
1027{
1028 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001029 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001030 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001031
Benjamin Petersonbac79492012-01-14 13:34:47 -05001032 if (PyUnicode_READY(unicode) == -1)
Victor Stinner7a9105a2011-12-12 00:13:42 +01001033 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001034
1035 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1036 if (copy == NULL)
1037 return NULL;
1038
1039 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +02001040 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001041 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +02001042 }
1043 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001044 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001045
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001046 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001047 if (w == NULL)
1048 return NULL;
1049 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1050 copy_length = Py_MIN(copy_length, length);
Christian Heimesf051e432016-09-13 20:22:02 +02001051 memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +02001052 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001053 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001054 }
1055}
1056
Guido van Rossumd57fd912000-03-10 22:53:23 +00001057/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +00001058 Ux0000 terminated; some code (e.g. new_identifier)
1059 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001060
1061 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +00001062 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001063
1064*/
1065
Alexander Belopolsky40018472011-02-26 01:02:56 +00001066static PyUnicodeObject *
1067_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001068{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001069 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001070 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001071
Thomas Wouters477c8d52006-05-27 19:21:47 +00001072 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001073 if (length == 0 && unicode_empty != NULL) {
1074 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001075 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001076 }
1077
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001078 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001079 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001080 return (PyUnicodeObject *)PyErr_NoMemory();
1081 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001082 if (length < 0) {
1083 PyErr_SetString(PyExc_SystemError,
1084 "Negative size passed to _PyUnicode_New");
1085 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001086 }
1087
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001088 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1089 if (unicode == NULL)
1090 return NULL;
1091 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +01001092
1093 _PyUnicode_WSTR_LENGTH(unicode) = length;
1094 _PyUnicode_HASH(unicode) = -1;
1095 _PyUnicode_STATE(unicode).interned = 0;
1096 _PyUnicode_STATE(unicode).kind = 0;
1097 _PyUnicode_STATE(unicode).compact = 0;
1098 _PyUnicode_STATE(unicode).ready = 0;
1099 _PyUnicode_STATE(unicode).ascii = 0;
1100 _PyUnicode_DATA_ANY(unicode) = NULL;
1101 _PyUnicode_LENGTH(unicode) = 0;
1102 _PyUnicode_UTF8(unicode) = NULL;
1103 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1104
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001105 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1106 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001107 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00001108 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001109 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00001110 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001111
Jeremy Hyltond8082792003-09-16 19:41:39 +00001112 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +00001113 * the caller fails before initializing str -- unicode_resize()
1114 * reads str[0], and the Keep-Alive optimization can keep memory
1115 * allocated for str alive across a call to unicode_dealloc(unicode).
1116 * We don't want unicode_resize to read uninitialized memory in
1117 * that case.
1118 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001119 _PyUnicode_WSTR(unicode)[0] = 0;
1120 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +01001121
Victor Stinner7931d9a2011-11-04 00:22:48 +01001122 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001123 return unicode;
1124}
1125
Victor Stinnerf42dc442011-10-02 23:33:16 +02001126static const char*
1127unicode_kind_name(PyObject *unicode)
1128{
Victor Stinner42dfd712011-10-03 14:41:45 +02001129 /* don't check consistency: unicode_kind_name() is called from
1130 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +02001131 if (!PyUnicode_IS_COMPACT(unicode))
1132 {
1133 if (!PyUnicode_IS_READY(unicode))
1134 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -06001135 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001136 {
1137 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001138 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001139 return "legacy ascii";
1140 else
1141 return "legacy latin1";
1142 case PyUnicode_2BYTE_KIND:
1143 return "legacy UCS2";
1144 case PyUnicode_4BYTE_KIND:
1145 return "legacy UCS4";
1146 default:
1147 return "<legacy invalid kind>";
1148 }
1149 }
1150 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -06001151 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001152 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001153 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001154 return "ascii";
1155 else
Victor Stinnera3b334d2011-10-03 13:53:37 +02001156 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001157 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001158 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001159 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001160 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001161 default:
1162 return "<invalid compact kind>";
1163 }
1164}
1165
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001166#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001167/* Functions wrapping macros for use in debugger */
/* Debugger helper: function wrapper around the PyUnicode_UTF8() macro. */
char *_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
1171
/* Debugger helper: function wrapper around the _PyUnicode_COMPACT_DATA()
   macro. */
void *_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}
1175void *_PyUnicode_data(void *unicode){
1176 printf("obj %p\n", unicode);
1177 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1178 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1179 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1180 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1181 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1182 return PyUnicode_DATA(unicode);
1183}
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001184
1185void
1186_PyUnicode_Dump(PyObject *op)
1187{
1188 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001189 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1190 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1191 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001192
Victor Stinnera849a4b2011-10-03 12:12:11 +02001193 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001194 {
1195 if (ascii->state.ascii)
1196 data = (ascii + 1);
1197 else
1198 data = (compact + 1);
1199 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001200 else
1201 data = unicode->data.any;
Victor Stinner293f3f52014-07-01 08:57:10 +02001202 printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1203 unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001204
Victor Stinnera849a4b2011-10-03 12:12:11 +02001205 if (ascii->wstr == data)
1206 printf("shared ");
1207 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001208
Victor Stinnera3b334d2011-10-03 13:53:37 +02001209 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinner293f3f52014-07-01 08:57:10 +02001210 printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
Victor Stinnera849a4b2011-10-03 12:12:11 +02001211 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1212 printf("shared ");
Victor Stinner293f3f52014-07-01 08:57:10 +02001213 printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
1214 compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001215 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001216 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001217}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001218#endif
1219
1220PyObject *
1221PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1222{
1223 PyObject *obj;
1224 PyCompactUnicodeObject *unicode;
1225 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001226 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001227 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001228 Py_ssize_t char_size;
1229 Py_ssize_t struct_size;
1230
1231 /* Optimization for empty strings */
1232 if (size == 0 && unicode_empty != NULL) {
1233 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001234 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001235 }
1236
Victor Stinner9e9d6892011-10-04 01:02:02 +02001237 is_ascii = 0;
1238 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001239 struct_size = sizeof(PyCompactUnicodeObject);
1240 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001241 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001242 char_size = 1;
1243 is_ascii = 1;
1244 struct_size = sizeof(PyASCIIObject);
1245 }
1246 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001247 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001248 char_size = 1;
1249 }
1250 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001251 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001252 char_size = 2;
1253 if (sizeof(wchar_t) == 2)
1254 is_sharing = 1;
1255 }
1256 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001257 if (maxchar > MAX_UNICODE) {
1258 PyErr_SetString(PyExc_SystemError,
1259 "invalid maximum character passed to PyUnicode_New");
1260 return NULL;
1261 }
Victor Stinner8f825062012-04-27 13:55:39 +02001262 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001263 char_size = 4;
1264 if (sizeof(wchar_t) == 4)
1265 is_sharing = 1;
1266 }
1267
1268 /* Ensure we won't overflow the size. */
1269 if (size < 0) {
1270 PyErr_SetString(PyExc_SystemError,
1271 "Negative size passed to PyUnicode_New");
1272 return NULL;
1273 }
1274 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1275 return PyErr_NoMemory();
1276
1277 /* Duplicated allocation code from _PyObject_New() instead of a call to
1278 * PyObject_New() so we are able to allocate space for the object and
1279 * it's data buffer.
1280 */
1281 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1282 if (obj == NULL)
1283 return PyErr_NoMemory();
1284 obj = PyObject_INIT(obj, &PyUnicode_Type);
1285 if (obj == NULL)
1286 return NULL;
1287
1288 unicode = (PyCompactUnicodeObject *)obj;
1289 if (is_ascii)
1290 data = ((PyASCIIObject*)obj) + 1;
1291 else
1292 data = unicode + 1;
1293 _PyUnicode_LENGTH(unicode) = size;
1294 _PyUnicode_HASH(unicode) = -1;
1295 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001296 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001297 _PyUnicode_STATE(unicode).compact = 1;
1298 _PyUnicode_STATE(unicode).ready = 1;
1299 _PyUnicode_STATE(unicode).ascii = is_ascii;
1300 if (is_ascii) {
1301 ((char*)data)[size] = 0;
1302 _PyUnicode_WSTR(unicode) = NULL;
1303 }
Victor Stinner8f825062012-04-27 13:55:39 +02001304 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001305 ((char*)data)[size] = 0;
1306 _PyUnicode_WSTR(unicode) = NULL;
1307 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001308 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001309 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001310 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001311 else {
1312 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001313 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001314 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001315 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001316 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001317 ((Py_UCS4*)data)[size] = 0;
1318 if (is_sharing) {
1319 _PyUnicode_WSTR_LENGTH(unicode) = size;
1320 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1321 }
1322 else {
1323 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1324 _PyUnicode_WSTR(unicode) = NULL;
1325 }
1326 }
Victor Stinner8f825062012-04-27 13:55:39 +02001327#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001328 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001329#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001330 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001331 return obj;
1332}
1333
#if SIZEOF_WCHAR_T == 2
/* Expand the 16-bit wchar_t buffer [begin, end) into the UCS4 storage of
   unicode.  A high surrogate directly followed by a low surrogate is joined
   into one code point; every other unit (including an unpaired surrogate)
   is copied through unchanged.

   unicode must be a 4-byte kind string whose length equals the number of
   code points produced; both conditions are asserted.  This function does
   not write a terminating null character itself, so the caller is expected
   to have reserved room for one and to store it. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        int is_pair;

        /* There must still be room for at least one more code point. */
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));

        is_pair = (Py_UNICODE_IS_HIGH_SURROGATE(src[0])
                   && (src + 1) < end
                   && Py_UNICODE_IS_LOW_SURROGATE(src[1]));
        if (!is_pair) {
            *dst++ = *src++;
            continue;
        }
        *dst++ = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
        src += 2;
    }

    /* The output must have been filled exactly. */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1373
Victor Stinnercd9950f2011-10-02 00:34:53 +02001374static int
Victor Stinner488fa492011-12-12 00:01:39 +01001375unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001376{
Victor Stinner488fa492011-12-12 00:01:39 +01001377 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001378 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001379 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001380 return -1;
1381 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001382 return 0;
1383}
1384
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001385static int
1386_copy_characters(PyObject *to, Py_ssize_t to_start,
1387 PyObject *from, Py_ssize_t from_start,
1388 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001389{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001390 unsigned int from_kind, to_kind;
1391 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001392
Victor Stinneree4544c2012-05-09 22:24:08 +02001393 assert(0 <= how_many);
1394 assert(0 <= from_start);
1395 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001396 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001397 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001398 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001399
Victor Stinnerd3f08822012-05-29 12:57:52 +02001400 assert(PyUnicode_Check(to));
1401 assert(PyUnicode_IS_READY(to));
1402 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1403
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001404 if (how_many == 0)
1405 return 0;
1406
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001407 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001408 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001409 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001410 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001411
Victor Stinnerf1852262012-06-16 16:38:26 +02001412#ifdef Py_DEBUG
1413 if (!check_maxchar
1414 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1415 {
1416 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1417 Py_UCS4 ch;
1418 Py_ssize_t i;
1419 for (i=0; i < how_many; i++) {
1420 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1421 assert(ch <= to_maxchar);
1422 }
1423 }
1424#endif
1425
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001426 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001427 if (check_maxchar
1428 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1429 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001430 /* Writing Latin-1 characters into an ASCII string requires to
1431 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001432 Py_UCS4 max_char;
1433 max_char = ucs1lib_find_max_char(from_data,
1434 (Py_UCS1*)from_data + how_many);
1435 if (max_char >= 128)
1436 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001437 }
Christian Heimesf051e432016-09-13 20:22:02 +02001438 memcpy((char*)to_data + to_kind * to_start,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001439 (char*)from_data + from_kind * from_start,
1440 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001441 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001442 else if (from_kind == PyUnicode_1BYTE_KIND
1443 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001444 {
1445 _PyUnicode_CONVERT_BYTES(
1446 Py_UCS1, Py_UCS2,
1447 PyUnicode_1BYTE_DATA(from) + from_start,
1448 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1449 PyUnicode_2BYTE_DATA(to) + to_start
1450 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001451 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001452 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001453 && to_kind == PyUnicode_4BYTE_KIND)
1454 {
1455 _PyUnicode_CONVERT_BYTES(
1456 Py_UCS1, Py_UCS4,
1457 PyUnicode_1BYTE_DATA(from) + from_start,
1458 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1459 PyUnicode_4BYTE_DATA(to) + to_start
1460 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001461 }
1462 else if (from_kind == PyUnicode_2BYTE_KIND
1463 && to_kind == PyUnicode_4BYTE_KIND)
1464 {
1465 _PyUnicode_CONVERT_BYTES(
1466 Py_UCS2, Py_UCS4,
1467 PyUnicode_2BYTE_DATA(from) + from_start,
1468 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1469 PyUnicode_4BYTE_DATA(to) + to_start
1470 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001471 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001472 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001473 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1474
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001475 if (!check_maxchar) {
1476 if (from_kind == PyUnicode_2BYTE_KIND
1477 && to_kind == PyUnicode_1BYTE_KIND)
1478 {
1479 _PyUnicode_CONVERT_BYTES(
1480 Py_UCS2, Py_UCS1,
1481 PyUnicode_2BYTE_DATA(from) + from_start,
1482 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1483 PyUnicode_1BYTE_DATA(to) + to_start
1484 );
1485 }
1486 else if (from_kind == PyUnicode_4BYTE_KIND
1487 && to_kind == PyUnicode_1BYTE_KIND)
1488 {
1489 _PyUnicode_CONVERT_BYTES(
1490 Py_UCS4, Py_UCS1,
1491 PyUnicode_4BYTE_DATA(from) + from_start,
1492 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1493 PyUnicode_1BYTE_DATA(to) + to_start
1494 );
1495 }
1496 else if (from_kind == PyUnicode_4BYTE_KIND
1497 && to_kind == PyUnicode_2BYTE_KIND)
1498 {
1499 _PyUnicode_CONVERT_BYTES(
1500 Py_UCS4, Py_UCS2,
1501 PyUnicode_4BYTE_DATA(from) + from_start,
1502 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1503 PyUnicode_2BYTE_DATA(to) + to_start
1504 );
1505 }
1506 else {
1507 assert(0);
1508 return -1;
1509 }
1510 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001511 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001512 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001513 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001514 Py_ssize_t i;
1515
Victor Stinnera0702ab2011-09-29 14:14:38 +02001516 for (i=0; i < how_many; i++) {
1517 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001518 if (ch > to_maxchar)
1519 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001520 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1521 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001522 }
1523 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001524 return 0;
1525}
1526
Victor Stinnerd3f08822012-05-29 12:57:52 +02001527void
1528_PyUnicode_FastCopyCharacters(
1529 PyObject *to, Py_ssize_t to_start,
1530 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001531{
1532 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1533}
1534
1535Py_ssize_t
1536PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1537 PyObject *from, Py_ssize_t from_start,
1538 Py_ssize_t how_many)
1539{
1540 int err;
1541
1542 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1543 PyErr_BadInternalCall();
1544 return -1;
1545 }
1546
Benjamin Petersonbac79492012-01-14 13:34:47 -05001547 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001548 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001549 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001550 return -1;
1551
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001552 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001553 PyErr_SetString(PyExc_IndexError, "string index out of range");
1554 return -1;
1555 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001556 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001557 PyErr_SetString(PyExc_IndexError, "string index out of range");
1558 return -1;
1559 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001560 if (how_many < 0) {
1561 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1562 return -1;
1563 }
1564 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001565 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1566 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001567 "Cannot write %zi characters at %zi "
1568 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001569 how_many, to_start, PyUnicode_GET_LENGTH(to));
1570 return -1;
1571 }
1572
1573 if (how_many == 0)
1574 return 0;
1575
Victor Stinner488fa492011-12-12 00:01:39 +01001576 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001577 return -1;
1578
1579 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1580 if (err) {
1581 PyErr_Format(PyExc_SystemError,
1582 "Cannot copy %s characters "
1583 "into a string of %s characters",
1584 unicode_kind_name(from),
1585 unicode_kind_name(to));
1586 return -1;
1587 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001588 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001589}
1590
Victor Stinner17222162011-09-28 22:15:37 +02001591/* Find the maximum code point and count the number of surrogate pairs so a
1592 correct string length can be computed before converting a string to UCS4.
1593 This function counts single surrogates as a character and not as a pair.
1594
1595 Return 0 on success, or -1 on error. */
1596static int
1597find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1598 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001599{
1600 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001601 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001602
Victor Stinnerc53be962011-10-02 21:33:54 +02001603 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001604 *num_surrogates = 0;
1605 *maxchar = 0;
1606
1607 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001608#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001609 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1610 && (iter+1) < end
1611 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1612 {
1613 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1614 ++(*num_surrogates);
1615 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001616 }
1617 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001618#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001619 {
1620 ch = *iter;
1621 iter++;
1622 }
1623 if (ch > *maxchar) {
1624 *maxchar = ch;
1625 if (*maxchar > MAX_UNICODE) {
1626 PyErr_Format(PyExc_ValueError,
1627 "character U+%x is not in range [U+0000; U+10ffff]",
1628 ch);
1629 return -1;
1630 }
1631 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001632 }
1633 return 0;
1634}
1635
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001636int
1637_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001638{
1639 wchar_t *end;
1640 Py_UCS4 maxchar = 0;
1641 Py_ssize_t num_surrogates;
1642#if SIZEOF_WCHAR_T == 2
1643 Py_ssize_t length_wo_surrogates;
1644#endif
1645
Georg Brandl7597add2011-10-05 16:36:47 +02001646 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001647 strings were created using _PyObject_New() and where no canonical
1648 representation (the str field) has been set yet aka strings
1649 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001650 assert(_PyUnicode_CHECK(unicode));
1651 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001652 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001653 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001654 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001655 /* Actually, it should neither be interned nor be anything else: */
1656 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001657
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001658 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001659 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001660 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001661 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001662
1663 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001664 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1665 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001666 PyErr_NoMemory();
1667 return -1;
1668 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001669 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001670 _PyUnicode_WSTR(unicode), end,
1671 PyUnicode_1BYTE_DATA(unicode));
1672 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1673 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1674 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1675 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001676 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001677 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001678 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001679 }
1680 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001681 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001682 _PyUnicode_UTF8(unicode) = NULL;
1683 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001684 }
1685 PyObject_FREE(_PyUnicode_WSTR(unicode));
1686 _PyUnicode_WSTR(unicode) = NULL;
1687 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1688 }
1689 /* In this case we might have to convert down from 4-byte native
1690 wchar_t to 2-byte unicode. */
1691 else if (maxchar < 65536) {
1692 assert(num_surrogates == 0 &&
1693 "FindMaxCharAndNumSurrogatePairs() messed up");
1694
Victor Stinner506f5922011-09-28 22:34:18 +02001695#if SIZEOF_WCHAR_T == 2
1696 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001697 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001698 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1699 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1700 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001701 _PyUnicode_UTF8(unicode) = NULL;
1702 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001703#else
1704 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001705 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001706 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001707 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001708 PyErr_NoMemory();
1709 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001710 }
Victor Stinner506f5922011-09-28 22:34:18 +02001711 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1712 _PyUnicode_WSTR(unicode), end,
1713 PyUnicode_2BYTE_DATA(unicode));
1714 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1715 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1716 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001717 _PyUnicode_UTF8(unicode) = NULL;
1718 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001719 PyObject_FREE(_PyUnicode_WSTR(unicode));
1720 _PyUnicode_WSTR(unicode) = NULL;
1721 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1722#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001723 }
1724 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1725 else {
1726#if SIZEOF_WCHAR_T == 2
1727 /* in case the native representation is 2-bytes, we need to allocate a
1728 new normalized 4-byte version. */
1729 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001730 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1731 PyErr_NoMemory();
1732 return -1;
1733 }
Victor Stinnerc3c74152011-10-02 20:39:55 +02001734 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1735 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001736 PyErr_NoMemory();
1737 return -1;
1738 }
1739 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1740 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001741 _PyUnicode_UTF8(unicode) = NULL;
1742 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001743 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1744 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001745 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001746 PyObject_FREE(_PyUnicode_WSTR(unicode));
1747 _PyUnicode_WSTR(unicode) = NULL;
1748 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1749#else
1750 assert(num_surrogates == 0);
1751
Victor Stinnerc3c74152011-10-02 20:39:55 +02001752 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001753 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001754 _PyUnicode_UTF8(unicode) = NULL;
1755 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001756 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1757#endif
1758 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1759 }
1760 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001761 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001762 return 0;
1763}
1764
Alexander Belopolsky40018472011-02-26 01:02:56 +00001765static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001766unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001767{
Walter Dörwald16807132007-05-25 13:52:07 +00001768 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001769 case SSTATE_NOT_INTERNED:
1770 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001771
Benjamin Peterson29060642009-01-31 22:14:21 +00001772 case SSTATE_INTERNED_MORTAL:
1773 /* revive dead object temporarily for DelItem */
1774 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001775 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001776 Py_FatalError(
1777 "deletion of interned string failed");
1778 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001779
Benjamin Peterson29060642009-01-31 22:14:21 +00001780 case SSTATE_INTERNED_IMMORTAL:
1781 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001782
Benjamin Peterson29060642009-01-31 22:14:21 +00001783 default:
1784 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001785 }
1786
Victor Stinner03490912011-10-03 23:45:12 +02001787 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001788 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001789 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001790 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001791 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1792 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001793
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001794 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001795}
1796
#ifdef Py_DEBUG
/* Debug helper: return 1 if unicode is one of the shared singletons, i.e.
   the empty string or the cached one-character Latin-1 string for its
   character, and 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;

    /* Only a one-character string with a canonical representation can be
       an entry of the Latin-1 cache. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    return (ch < 256 && unicode_latin1[ch] == unicode);
}
#endif
1813
Alexander Belopolsky40018472011-02-26 01:02:56 +00001814static int
Victor Stinner488fa492011-12-12 00:01:39 +01001815unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001816{
Victor Stinner488fa492011-12-12 00:01:39 +01001817 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001818 if (Py_REFCNT(unicode) != 1)
1819 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001820 if (_PyUnicode_HASH(unicode) != -1)
1821 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001822 if (PyUnicode_CHECK_INTERNED(unicode))
1823 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001824 if (!PyUnicode_CheckExact(unicode))
1825 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001826#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001827 /* singleton refcount is greater than 1 */
1828 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001829#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001830 return 1;
1831}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001832
Victor Stinnerfe226c02011-10-03 03:52:20 +02001833static int
1834unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1835{
1836 PyObject *unicode;
1837 Py_ssize_t old_length;
1838
1839 assert(p_unicode != NULL);
1840 unicode = *p_unicode;
1841
1842 assert(unicode != NULL);
1843 assert(PyUnicode_Check(unicode));
1844 assert(0 <= length);
1845
Victor Stinner910337b2011-10-03 03:20:16 +02001846 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001847 old_length = PyUnicode_WSTR_LENGTH(unicode);
1848 else
1849 old_length = PyUnicode_GET_LENGTH(unicode);
1850 if (old_length == length)
1851 return 0;
1852
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001853 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001854 _Py_INCREF_UNICODE_EMPTY();
1855 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001856 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001857 Py_SETREF(*p_unicode, unicode_empty);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001858 return 0;
1859 }
1860
Victor Stinner488fa492011-12-12 00:01:39 +01001861 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001862 PyObject *copy = resize_copy(unicode, length);
1863 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001864 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001865 Py_SETREF(*p_unicode, copy);
Benjamin Peterson29060642009-01-31 22:14:21 +00001866 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001867 }
1868
Victor Stinnerfe226c02011-10-03 03:52:20 +02001869 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001870 PyObject *new_unicode = resize_compact(unicode, length);
1871 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001872 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001873 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001874 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001875 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001876 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001877}
1878
Alexander Belopolsky40018472011-02-26 01:02:56 +00001879int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001880PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001881{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001882 PyObject *unicode;
1883 if (p_unicode == NULL) {
1884 PyErr_BadInternalCall();
1885 return -1;
1886 }
1887 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001888 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001889 {
1890 PyErr_BadInternalCall();
1891 return -1;
1892 }
1893 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001894}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001895
Serhiy Storchakad65c9492015-11-02 14:10:23 +02001896/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001897
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001898 WARNING: The function doesn't copy the terminating null character and
1899 doesn't check the maximum character (may write a latin1 character in an
1900 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001901static void
1902unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1903 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001904{
1905 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1906 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001907 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001908
1909 switch (kind) {
1910 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001911 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02001912#ifdef Py_DEBUG
1913 if (PyUnicode_IS_ASCII(unicode)) {
1914 Py_UCS4 maxchar = ucs1lib_find_max_char(
1915 (const Py_UCS1*)str,
1916 (const Py_UCS1*)str + len);
1917 assert(maxchar < 128);
1918 }
1919#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001920 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001921 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001922 }
1923 case PyUnicode_2BYTE_KIND: {
1924 Py_UCS2 *start = (Py_UCS2 *)data + index;
1925 Py_UCS2 *ucs2 = start;
1926 assert(index <= PyUnicode_GET_LENGTH(unicode));
1927
Victor Stinner184252a2012-06-16 02:57:41 +02001928 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001929 *ucs2 = (Py_UCS2)*str;
1930
1931 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001932 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001933 }
1934 default: {
1935 Py_UCS4 *start = (Py_UCS4 *)data + index;
1936 Py_UCS4 *ucs4 = start;
1937 assert(kind == PyUnicode_4BYTE_KIND);
1938 assert(index <= PyUnicode_GET_LENGTH(unicode));
1939
Victor Stinner184252a2012-06-16 02:57:41 +02001940 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001941 *ucs4 = (Py_UCS4)*str;
1942
1943 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001944 }
1945 }
1946}
1947
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001948static PyObject*
1949get_latin1_char(unsigned char ch)
1950{
Victor Stinnera464fc12011-10-02 20:39:30 +02001951 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001952 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001953 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001954 if (!unicode)
1955 return NULL;
1956 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001957 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001958 unicode_latin1[ch] = unicode;
1959 }
1960 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001961 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001962}
1963
Victor Stinner985a82a2014-01-03 12:53:47 +01001964static PyObject*
1965unicode_char(Py_UCS4 ch)
1966{
1967 PyObject *unicode;
1968
1969 assert(ch <= MAX_UNICODE);
1970
Victor Stinnerf3b46b42014-01-03 13:16:00 +01001971 if (ch < 256)
1972 return get_latin1_char(ch);
1973
Victor Stinner985a82a2014-01-03 12:53:47 +01001974 unicode = PyUnicode_New(1, ch);
1975 if (unicode == NULL)
1976 return NULL;
1977 switch (PyUnicode_KIND(unicode)) {
1978 case PyUnicode_1BYTE_KIND:
1979 PyUnicode_1BYTE_DATA(unicode)[0] = (Py_UCS1)ch;
1980 break;
1981 case PyUnicode_2BYTE_KIND:
1982 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
1983 break;
1984 default:
1985 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1986 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
1987 }
1988 assert(_PyUnicode_CheckConsistency(unicode, 1));
1989 return unicode;
1990}
1991
Alexander Belopolsky40018472011-02-26 01:02:56 +00001992PyObject *
1993PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001994{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001995 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001996 Py_UCS4 maxchar = 0;
1997 Py_ssize_t num_surrogates;
1998
1999 if (u == NULL)
2000 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002001
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002002 /* If the Unicode data is known at construction time, we can apply
2003 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002004
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002005 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02002006 if (size == 0)
2007 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00002008
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002009 /* Single character Unicode objects in the Latin-1 range are
2010 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002011 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002012 return get_latin1_char((unsigned char)*u);
2013
2014 /* If not empty and not single character, copy the Unicode data
2015 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02002016 if (find_maxchar_surrogates(u, u + size,
2017 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002018 return NULL;
2019
Victor Stinner8faf8212011-12-08 22:14:11 +01002020 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002021 if (!unicode)
2022 return NULL;
2023
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002024 switch (PyUnicode_KIND(unicode)) {
2025 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002026 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002027 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2028 break;
2029 case PyUnicode_2BYTE_KIND:
2030#if Py_UNICODE_SIZE == 2
Christian Heimesf051e432016-09-13 20:22:02 +02002031 memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002032#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002033 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002034 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2035#endif
2036 break;
2037 case PyUnicode_4BYTE_KIND:
2038#if SIZEOF_WCHAR_T == 2
2039 /* This is the only case which has to process surrogates, thus
2040 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02002041 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002042#else
2043 assert(num_surrogates == 0);
Christian Heimesf051e432016-09-13 20:22:02 +02002044 memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002045#endif
2046 break;
2047 default:
2048 assert(0 && "Impossible state");
2049 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002050
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002051 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002052}
2053
Alexander Belopolsky40018472011-02-26 01:02:56 +00002054PyObject *
2055PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002056{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002057 if (size < 0) {
2058 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00002059 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00002060 return NULL;
2061 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002062 if (u != NULL)
2063 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2064 else
2065 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002066}
2067
Alexander Belopolsky40018472011-02-26 01:02:56 +00002068PyObject *
2069PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00002070{
2071 size_t size = strlen(u);
2072 if (size > PY_SSIZE_T_MAX) {
2073 PyErr_SetString(PyExc_OverflowError, "input too long");
2074 return NULL;
2075 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002076 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002077}
2078
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002079PyObject *
2080_PyUnicode_FromId(_Py_Identifier *id)
2081{
2082 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01002083 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
2084 strlen(id->string),
2085 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002086 if (!id->object)
2087 return NULL;
2088 PyUnicode_InternInPlace(&id->object);
2089 assert(!id->next);
2090 id->next = static_strings;
2091 static_strings = id;
2092 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002093 return id->object;
2094}
2095
2096void
2097_PyUnicode_ClearStaticStrings()
2098{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002099 _Py_Identifier *tmp, *s = static_strings;
2100 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02002101 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002102 tmp = s->next;
2103 s->next = NULL;
2104 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002105 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002106 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002107}
2108
Benjamin Peterson0df54292012-03-26 14:50:32 -04002109/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002110
Victor Stinnerd3f08822012-05-29 12:57:52 +02002111PyObject*
2112_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02002113{
Victor Stinnerd3f08822012-05-29 12:57:52 +02002114 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01002115 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01002116 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02002117#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002118 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02002119#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002120 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01002121 }
Victor Stinner785938e2011-12-11 20:09:03 +01002122 unicode = PyUnicode_New(size, 127);
2123 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02002124 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01002125 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2126 assert(_PyUnicode_CheckConsistency(unicode, 1));
2127 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02002128}
2129
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002130static Py_UCS4
2131kind_maxchar_limit(unsigned int kind)
2132{
Benjamin Petersonead6b532011-12-20 17:23:42 -06002133 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002134 case PyUnicode_1BYTE_KIND:
2135 return 0x80;
2136 case PyUnicode_2BYTE_KIND:
2137 return 0x100;
2138 case PyUnicode_4BYTE_KIND:
2139 return 0x10000;
2140 default:
2141 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01002142 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002143 }
2144}
2145
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -07002146static inline Py_UCS4
Victor Stinnere6abb482012-05-02 01:15:40 +02002147align_maxchar(Py_UCS4 maxchar)
2148{
2149 if (maxchar <= 127)
2150 return 127;
2151 else if (maxchar <= 255)
2152 return 255;
2153 else if (maxchar <= 65535)
2154 return 65535;
2155 else
2156 return MAX_UNICODE;
2157}
2158
Victor Stinner702c7342011-10-05 13:50:52 +02002159static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002160_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00002161{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002162 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002163 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002164
Serhiy Storchaka678db842013-01-26 12:16:36 +02002165 if (size == 0)
2166 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002167 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002168 if (size == 1)
2169 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002170
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002171 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002172 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002173 if (!res)
2174 return NULL;
2175 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002176 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002177 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00002178}
2179
Victor Stinnere57b1c02011-09-28 22:20:48 +02002180static PyObject*
2181_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002182{
2183 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002184 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002185
Serhiy Storchaka678db842013-01-26 12:16:36 +02002186 if (size == 0)
2187 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002188 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002189 if (size == 1)
2190 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002191
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002192 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002193 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002194 if (!res)
2195 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002196 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002197 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002198 else {
2199 _PyUnicode_CONVERT_BYTES(
2200 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2201 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002202 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002203 return res;
2204}
2205
Victor Stinnere57b1c02011-09-28 22:20:48 +02002206static PyObject*
2207_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002208{
2209 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002210 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002211
Serhiy Storchaka678db842013-01-26 12:16:36 +02002212 if (size == 0)
2213 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002214 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002215 if (size == 1)
2216 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002217
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002218 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002219 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002220 if (!res)
2221 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002222 if (max_char < 256)
2223 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2224 PyUnicode_1BYTE_DATA(res));
2225 else if (max_char < 0x10000)
2226 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2227 PyUnicode_2BYTE_DATA(res));
2228 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002229 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002230 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002231 return res;
2232}
2233
2234PyObject*
2235PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2236{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002237 if (size < 0) {
2238 PyErr_SetString(PyExc_ValueError, "size must be positive");
2239 return NULL;
2240 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002241 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002242 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002243 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002244 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002245 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002246 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002247 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002248 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002249 PyErr_SetString(PyExc_SystemError, "invalid kind");
2250 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002251 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002252}
2253
Victor Stinnerece58de2012-04-23 23:36:38 +02002254Py_UCS4
2255_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2256{
2257 enum PyUnicode_Kind kind;
2258 void *startptr, *endptr;
2259
2260 assert(PyUnicode_IS_READY(unicode));
2261 assert(0 <= start);
2262 assert(end <= PyUnicode_GET_LENGTH(unicode));
2263 assert(start <= end);
2264
2265 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2266 return PyUnicode_MAX_CHAR_VALUE(unicode);
2267
2268 if (start == end)
2269 return 127;
2270
Victor Stinner94d558b2012-04-27 22:26:58 +02002271 if (PyUnicode_IS_ASCII(unicode))
2272 return 127;
2273
Victor Stinnerece58de2012-04-23 23:36:38 +02002274 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002275 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002276 endptr = (char *)startptr + end * kind;
2277 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002278 switch(kind) {
2279 case PyUnicode_1BYTE_KIND:
2280 return ucs1lib_find_max_char(startptr, endptr);
2281 case PyUnicode_2BYTE_KIND:
2282 return ucs2lib_find_max_char(startptr, endptr);
2283 case PyUnicode_4BYTE_KIND:
2284 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002285 default:
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002286 assert(0);
2287 return 0;
Victor Stinnerece58de2012-04-23 23:36:38 +02002288 }
2289}
2290
Victor Stinner25a4b292011-10-06 12:31:55 +02002291/* Ensure that a string uses the most efficient storage, if it is not the
2292 case: create a new string with of the right kind. Write NULL into *p_unicode
2293 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002294static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002295unicode_adjust_maxchar(PyObject **p_unicode)
2296{
2297 PyObject *unicode, *copy;
2298 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002299 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002300 unsigned int kind;
2301
2302 assert(p_unicode != NULL);
2303 unicode = *p_unicode;
2304 assert(PyUnicode_IS_READY(unicode));
2305 if (PyUnicode_IS_ASCII(unicode))
2306 return;
2307
2308 len = PyUnicode_GET_LENGTH(unicode);
2309 kind = PyUnicode_KIND(unicode);
2310 if (kind == PyUnicode_1BYTE_KIND) {
2311 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002312 max_char = ucs1lib_find_max_char(u, u + len);
2313 if (max_char >= 128)
2314 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002315 }
2316 else if (kind == PyUnicode_2BYTE_KIND) {
2317 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002318 max_char = ucs2lib_find_max_char(u, u + len);
2319 if (max_char >= 256)
2320 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002321 }
2322 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002323 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002324 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002325 max_char = ucs4lib_find_max_char(u, u + len);
2326 if (max_char >= 0x10000)
2327 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002328 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002329 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002330 if (copy != NULL)
2331 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002332 Py_DECREF(unicode);
2333 *p_unicode = copy;
2334}
2335
Victor Stinner034f6cf2011-09-30 02:26:44 +02002336PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002337_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002338{
Victor Stinner87af4f22011-11-21 23:03:47 +01002339 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002340 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002341
Victor Stinner034f6cf2011-09-30 02:26:44 +02002342 if (!PyUnicode_Check(unicode)) {
2343 PyErr_BadInternalCall();
2344 return NULL;
2345 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002346 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002347 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002348
Victor Stinner87af4f22011-11-21 23:03:47 +01002349 length = PyUnicode_GET_LENGTH(unicode);
2350 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002351 if (!copy)
2352 return NULL;
2353 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2354
Christian Heimesf051e432016-09-13 20:22:02 +02002355 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
Victor Stinner87af4f22011-11-21 23:03:47 +01002356 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002357 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002358 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002359}
2360
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002361
Victor Stinnerbc603d12011-10-02 01:00:40 +02002362/* Widen Unicode objects to larger buffers. Don't write terminating null
2363 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002364
2365void*
2366_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2367{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002368 Py_ssize_t len;
2369 void *result;
2370 unsigned int skind;
2371
Benjamin Petersonbac79492012-01-14 13:34:47 -05002372 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002373 return NULL;
2374
2375 len = PyUnicode_GET_LENGTH(s);
2376 skind = PyUnicode_KIND(s);
2377 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002378 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002379 return NULL;
2380 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002381 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002382 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002383 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002384 if (!result)
2385 return PyErr_NoMemory();
2386 assert(skind == PyUnicode_1BYTE_KIND);
2387 _PyUnicode_CONVERT_BYTES(
2388 Py_UCS1, Py_UCS2,
2389 PyUnicode_1BYTE_DATA(s),
2390 PyUnicode_1BYTE_DATA(s) + len,
2391 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002392 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002393 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002394 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002395 if (!result)
2396 return PyErr_NoMemory();
2397 if (skind == PyUnicode_2BYTE_KIND) {
2398 _PyUnicode_CONVERT_BYTES(
2399 Py_UCS2, Py_UCS4,
2400 PyUnicode_2BYTE_DATA(s),
2401 PyUnicode_2BYTE_DATA(s) + len,
2402 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002403 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002404 else {
2405 assert(skind == PyUnicode_1BYTE_KIND);
2406 _PyUnicode_CONVERT_BYTES(
2407 Py_UCS1, Py_UCS4,
2408 PyUnicode_1BYTE_DATA(s),
2409 PyUnicode_1BYTE_DATA(s) + len,
2410 result);
2411 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002412 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002413 default:
2414 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002415 }
Victor Stinner01698042011-10-04 00:04:26 +02002416 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002417 return NULL;
2418}
2419
2420static Py_UCS4*
2421as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2422 int copy_null)
2423{
2424 int kind;
2425 void *data;
2426 Py_ssize_t len, targetlen;
2427 if (PyUnicode_READY(string) == -1)
2428 return NULL;
2429 kind = PyUnicode_KIND(string);
2430 data = PyUnicode_DATA(string);
2431 len = PyUnicode_GET_LENGTH(string);
2432 targetlen = len;
2433 if (copy_null)
2434 targetlen++;
2435 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002436 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002437 if (!target) {
2438 PyErr_NoMemory();
2439 return NULL;
2440 }
2441 }
2442 else {
2443 if (targetsize < targetlen) {
2444 PyErr_Format(PyExc_SystemError,
2445 "string is longer than the buffer");
2446 if (copy_null && 0 < targetsize)
2447 target[0] = 0;
2448 return NULL;
2449 }
2450 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002451 if (kind == PyUnicode_1BYTE_KIND) {
2452 Py_UCS1 *start = (Py_UCS1 *) data;
2453 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002454 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002455 else if (kind == PyUnicode_2BYTE_KIND) {
2456 Py_UCS2 *start = (Py_UCS2 *) data;
2457 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2458 }
2459 else {
2460 assert(kind == PyUnicode_4BYTE_KIND);
Christian Heimesf051e432016-09-13 20:22:02 +02002461 memcpy(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002462 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002463 if (copy_null)
2464 target[len] = 0;
2465 return target;
2466}
2467
2468Py_UCS4*
2469PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2470 int copy_null)
2471{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002472 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002473 PyErr_BadInternalCall();
2474 return NULL;
2475 }
2476 return as_ucs4(string, target, targetsize, copy_null);
2477}
2478
2479Py_UCS4*
2480PyUnicode_AsUCS4Copy(PyObject *string)
2481{
2482 return as_ucs4(string, NULL, 0, 1);
2483}
2484
#ifdef HAVE_WCHAR_H

/* Create a str from the wchar_t buffer `w` holding `size` characters.

   size == -1 means "measure with wcslen()".  A NULL buffer is accepted
   only together with size == 0 (the empty string is returned); any other
   size with a NULL buffer triggers PyErr_BadInternalCall(). */
PyObject *
PyUnicode_FromWideChar(const wchar_t *w, Py_ssize_t size)
{
    if (w != NULL) {
        if (size == -1) {
            size = wcslen(w);
        }
        return PyUnicode_FromUnicode(w, size);
    }

    if (size == 0) {
        _Py_RETURN_UNICODE_EMPTY();
    }
    PyErr_BadInternalCall();
    return NULL;
}

#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002505
Victor Stinner15a11362012-10-06 23:48:20 +02002506/* maximum number of characters required for output of %lld or %p.
Victor Stinnere215d962012-10-06 23:03:36 +02002507 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
2508 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
2509#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002510
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002511static int
2512unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2513 Py_ssize_t width, Py_ssize_t precision)
2514{
2515 Py_ssize_t length, fill, arglen;
2516 Py_UCS4 maxchar;
2517
2518 if (PyUnicode_READY(str) == -1)
2519 return -1;
2520
2521 length = PyUnicode_GET_LENGTH(str);
2522 if ((precision == -1 || precision >= length)
2523 && width <= length)
2524 return _PyUnicodeWriter_WriteStr(writer, str);
2525
2526 if (precision != -1)
2527 length = Py_MIN(precision, length);
2528
2529 arglen = Py_MAX(length, width);
2530 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2531 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2532 else
2533 maxchar = writer->maxchar;
2534
2535 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2536 return -1;
2537
2538 if (width > length) {
2539 fill = width - length;
2540 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2541 return -1;
2542 writer->pos += fill;
2543 }
2544
2545 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2546 str, 0, length);
2547 writer->pos += length;
2548 return 0;
2549}
2550
2551static int
2552unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2553 Py_ssize_t width, Py_ssize_t precision)
2554{
2555 /* UTF-8 */
2556 Py_ssize_t length;
2557 PyObject *unicode;
2558 int res;
2559
2560 length = strlen(str);
2561 if (precision != -1)
2562 length = Py_MIN(length, precision);
2563 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2564 if (unicode == NULL)
2565 return -1;
2566
2567 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2568 Py_DECREF(unicode);
2569 return res;
2570}
2571
Victor Stinner96865452011-03-01 23:44:09 +00002572static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002573unicode_fromformat_arg(_PyUnicodeWriter *writer,
2574 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002575{
Victor Stinnere215d962012-10-06 23:03:36 +02002576 const char *p;
2577 Py_ssize_t len;
2578 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002579 Py_ssize_t width;
2580 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002581 int longflag;
2582 int longlongflag;
2583 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002584 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002585
2586 p = f;
2587 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002588 zeropad = 0;
2589 if (*f == '0') {
2590 zeropad = 1;
2591 f++;
2592 }
Victor Stinner96865452011-03-01 23:44:09 +00002593
2594 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002595 width = -1;
2596 if (Py_ISDIGIT((unsigned)*f)) {
2597 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002598 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002599 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002600 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002601 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002602 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002603 return NULL;
2604 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002605 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002606 f++;
2607 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002608 }
2609 precision = -1;
2610 if (*f == '.') {
2611 f++;
2612 if (Py_ISDIGIT((unsigned)*f)) {
2613 precision = (*f - '0');
2614 f++;
2615 while (Py_ISDIGIT((unsigned)*f)) {
2616 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2617 PyErr_SetString(PyExc_ValueError,
2618 "precision too big");
2619 return NULL;
2620 }
2621 precision = (precision * 10) + (*f - '0');
2622 f++;
2623 }
2624 }
Victor Stinner96865452011-03-01 23:44:09 +00002625 if (*f == '%') {
2626 /* "%.3%s" => f points to "3" */
2627 f--;
2628 }
2629 }
2630 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002631 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002632 f--;
2633 }
Victor Stinner96865452011-03-01 23:44:09 +00002634
2635 /* Handle %ld, %lu, %lld and %llu. */
2636 longflag = 0;
2637 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002638 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002639 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002640 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002641 longflag = 1;
2642 ++f;
2643 }
Victor Stinner96865452011-03-01 23:44:09 +00002644 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002645 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002646 longlongflag = 1;
2647 f += 2;
2648 }
Victor Stinner96865452011-03-01 23:44:09 +00002649 }
2650 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002651 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002652 size_tflag = 1;
2653 ++f;
2654 }
Victor Stinnere215d962012-10-06 23:03:36 +02002655
2656 if (f[1] == '\0')
2657 writer->overallocate = 0;
2658
2659 switch (*f) {
2660 case 'c':
2661 {
2662 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002663 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002664 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002665 "character argument not in range(0x110000)");
2666 return NULL;
2667 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002668 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002669 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002670 break;
2671 }
2672
2673 case 'i':
2674 case 'd':
2675 case 'u':
2676 case 'x':
2677 {
2678 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002679 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002680 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002681
2682 if (*f == 'u') {
Victor Stinnere215d962012-10-06 23:03:36 +02002683 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002684 len = sprintf(buffer, "%lu",
Victor Stinnere215d962012-10-06 23:03:36 +02002685 va_arg(*vargs, unsigned long));
Victor Stinnere215d962012-10-06 23:03:36 +02002686 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002687 len = sprintf(buffer, "%llu",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002688 va_arg(*vargs, unsigned long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002689 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002690 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002691 va_arg(*vargs, size_t));
2692 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002693 len = sprintf(buffer, "%u",
Victor Stinnere215d962012-10-06 23:03:36 +02002694 va_arg(*vargs, unsigned int));
2695 }
2696 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002697 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002698 }
2699 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002700 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002701 len = sprintf(buffer, "%li",
Victor Stinnere215d962012-10-06 23:03:36 +02002702 va_arg(*vargs, long));
Victor Stinnere215d962012-10-06 23:03:36 +02002703 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002704 len = sprintf(buffer, "%lli",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002705 va_arg(*vargs, long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002706 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002707 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002708 va_arg(*vargs, Py_ssize_t));
2709 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002710 len = sprintf(buffer, "%i",
Victor Stinnere215d962012-10-06 23:03:36 +02002711 va_arg(*vargs, int));
2712 }
2713 assert(len >= 0);
2714
Victor Stinnere215d962012-10-06 23:03:36 +02002715 if (precision < len)
2716 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002717
2718 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002719 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2720 return NULL;
2721
Victor Stinnere215d962012-10-06 23:03:36 +02002722 if (width > precision) {
2723 Py_UCS4 fillchar;
2724 fill = width - precision;
2725 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002726 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2727 return NULL;
2728 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002729 }
Victor Stinner15a11362012-10-06 23:48:20 +02002730 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002731 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002732 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2733 return NULL;
2734 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002735 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002736
Victor Stinner4a587072013-11-19 12:54:53 +01002737 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2738 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002739 break;
2740 }
2741
2742 case 'p':
2743 {
2744 char number[MAX_LONG_LONG_CHARS];
2745
2746 len = sprintf(number, "%p", va_arg(*vargs, void*));
2747 assert(len >= 0);
2748
2749 /* %p is ill-defined: ensure leading 0x. */
2750 if (number[1] == 'X')
2751 number[1] = 'x';
2752 else if (number[1] != 'x') {
2753 memmove(number + 2, number,
2754 strlen(number) + 1);
2755 number[0] = '0';
2756 number[1] = 'x';
2757 len += 2;
2758 }
2759
Victor Stinner4a587072013-11-19 12:54:53 +01002760 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002761 return NULL;
2762 break;
2763 }
2764
2765 case 's':
2766 {
2767 /* UTF-8 */
2768 const char *s = va_arg(*vargs, const char*);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002769 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002770 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002771 break;
2772 }
2773
2774 case 'U':
2775 {
2776 PyObject *obj = va_arg(*vargs, PyObject *);
2777 assert(obj && _PyUnicode_CHECK(obj));
2778
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002779 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002780 return NULL;
2781 break;
2782 }
2783
2784 case 'V':
2785 {
2786 PyObject *obj = va_arg(*vargs, PyObject *);
2787 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002788 if (obj) {
2789 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002790 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002791 return NULL;
2792 }
2793 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002794 assert(str != NULL);
2795 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002796 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002797 }
2798 break;
2799 }
2800
2801 case 'S':
2802 {
2803 PyObject *obj = va_arg(*vargs, PyObject *);
2804 PyObject *str;
2805 assert(obj);
2806 str = PyObject_Str(obj);
2807 if (!str)
2808 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002809 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002810 Py_DECREF(str);
2811 return NULL;
2812 }
2813 Py_DECREF(str);
2814 break;
2815 }
2816
2817 case 'R':
2818 {
2819 PyObject *obj = va_arg(*vargs, PyObject *);
2820 PyObject *repr;
2821 assert(obj);
2822 repr = PyObject_Repr(obj);
2823 if (!repr)
2824 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002825 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002826 Py_DECREF(repr);
2827 return NULL;
2828 }
2829 Py_DECREF(repr);
2830 break;
2831 }
2832
2833 case 'A':
2834 {
2835 PyObject *obj = va_arg(*vargs, PyObject *);
2836 PyObject *ascii;
2837 assert(obj);
2838 ascii = PyObject_ASCII(obj);
2839 if (!ascii)
2840 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002841 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002842 Py_DECREF(ascii);
2843 return NULL;
2844 }
2845 Py_DECREF(ascii);
2846 break;
2847 }
2848
2849 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002850 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002851 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002852 break;
2853
2854 default:
2855 /* if we stumble upon an unknown formatting code, copy the rest
2856 of the format string to the output string. (we cannot just
2857 skip the code, since there's no way to know what's in the
2858 argument list) */
2859 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01002860 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002861 return NULL;
2862 f = p+len;
2863 return f;
2864 }
2865
2866 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002867 return f;
2868}
2869
Walter Dörwaldd2034312007-05-18 16:29:38 +00002870PyObject *
2871PyUnicode_FromFormatV(const char *format, va_list vargs)
2872{
Victor Stinnere215d962012-10-06 23:03:36 +02002873 va_list vargs2;
2874 const char *f;
2875 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002876
Victor Stinner8f674cc2013-04-17 23:02:17 +02002877 _PyUnicodeWriter_Init(&writer);
2878 writer.min_length = strlen(format) + 100;
2879 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02002880
Benjamin Peterson0c212142016-09-20 20:39:33 -07002881 // Copy varags to be able to pass a reference to a subfunction.
2882 va_copy(vargs2, vargs);
Victor Stinnere215d962012-10-06 23:03:36 +02002883
2884 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002885 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002886 f = unicode_fromformat_arg(&writer, f, &vargs2);
2887 if (f == NULL)
2888 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002889 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002890 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002891 const char *p;
2892 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002893
Victor Stinnere215d962012-10-06 23:03:36 +02002894 p = f;
2895 do
2896 {
2897 if ((unsigned char)*p > 127) {
2898 PyErr_Format(PyExc_ValueError,
2899 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2900 "string, got a non-ASCII byte: 0x%02x",
2901 (unsigned char)*p);
Victor Stinner1ddf53d2016-09-21 14:13:14 +02002902 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002903 }
2904 p++;
2905 }
2906 while (*p != '\0' && *p != '%');
2907 len = p - f;
2908
2909 if (*p == '\0')
2910 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01002911
2912 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002913 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002914
2915 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002916 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002917 }
Christian Heimes2f2fee12016-09-21 11:37:27 +02002918 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02002919 return _PyUnicodeWriter_Finish(&writer);
2920
2921 fail:
Christian Heimes2f2fee12016-09-21 11:37:27 +02002922 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02002923 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002924 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002925}
2926
Walter Dörwaldd2034312007-05-18 16:29:38 +00002927PyObject *
2928PyUnicode_FromFormat(const char *format, ...)
2929{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002930 PyObject* ret;
2931 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002932
2933#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002934 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002935#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002936 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002937#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002938 ret = PyUnicode_FromFormatV(format, vargs);
2939 va_end(vargs);
2940 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002941}
2942
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002943#ifdef HAVE_WCHAR_H
2944
Victor Stinner5593d8a2010-10-02 11:11:27 +00002945/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2946 convert a Unicode object to a wide character string.
2947
Victor Stinnerd88d9832011-09-06 02:00:05 +02002948 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002949 character) required to convert the unicode object. Ignore size argument.
2950
Victor Stinnerd88d9832011-09-06 02:00:05 +02002951 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002952 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002953 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002954static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002955unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002956 wchar_t *w,
2957 Py_ssize_t size)
2958{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002959 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002960 const wchar_t *wstr;
2961
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002962 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002963 if (wstr == NULL)
2964 return -1;
2965
Victor Stinner5593d8a2010-10-02 11:11:27 +00002966 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002967 if (size > res)
2968 size = res + 1;
2969 else
2970 res = size;
Christian Heimesf051e432016-09-13 20:22:02 +02002971 memcpy(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002972 return res;
2973 }
2974 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002975 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002976}
2977
2978Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002979PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002980 wchar_t *w,
2981 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002982{
2983 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002984 PyErr_BadInternalCall();
2985 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002986 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002987 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002988}
2989
Victor Stinner137c34c2010-09-29 10:25:54 +00002990wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002991PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002992 Py_ssize_t *size)
2993{
2994 wchar_t* buffer;
2995 Py_ssize_t buflen;
2996
2997 if (unicode == NULL) {
2998 PyErr_BadInternalCall();
2999 return NULL;
3000 }
3001
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003002 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003003 if (buflen == -1)
3004 return NULL;
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02003005 buffer = PyMem_NEW(wchar_t, buflen);
Victor Stinner137c34c2010-09-29 10:25:54 +00003006 if (buffer == NULL) {
3007 PyErr_NoMemory();
3008 return NULL;
3009 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003010 buflen = unicode_aswidechar(unicode, buffer, buflen);
Stefan Krah8528c312012-08-19 21:52:43 +02003011 if (buflen == -1) {
3012 PyMem_FREE(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003013 return NULL;
Stefan Krah8528c312012-08-19 21:52:43 +02003014 }
Victor Stinner5593d8a2010-10-02 11:11:27 +00003015 if (size != NULL)
3016 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00003017 return buffer;
3018}
3019
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003020#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003021
Alexander Belopolsky40018472011-02-26 01:02:56 +00003022PyObject *
3023PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003024{
Victor Stinner8faf8212011-12-08 22:14:11 +01003025 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003026 PyErr_SetString(PyExc_ValueError,
3027 "chr() arg not in range(0x110000)");
3028 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003029 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00003030
Victor Stinner985a82a2014-01-03 12:53:47 +01003031 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003032}
3033
Alexander Belopolsky40018472011-02-26 01:02:56 +00003034PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003035PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003036{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003037 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00003038 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003039 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003040 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02003041 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00003042 Py_INCREF(obj);
3043 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003044 }
3045 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003046 /* For a Unicode subtype that's not a Unicode object,
3047 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003048 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003049 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003050 PyErr_Format(PyExc_TypeError,
3051 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00003052 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003053 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003054}
3055
Alexander Belopolsky40018472011-02-26 01:02:56 +00003056PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003057PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003058 const char *encoding,
3059 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003060{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003061 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003062 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003063
Guido van Rossumd57fd912000-03-10 22:53:23 +00003064 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003065 PyErr_BadInternalCall();
3066 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003067 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003068
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003069 /* Decoding bytes objects is the most common case and should be fast */
3070 if (PyBytes_Check(obj)) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003071 if (PyBytes_GET_SIZE(obj) == 0)
3072 _Py_RETURN_UNICODE_EMPTY();
3073 v = PyUnicode_Decode(
3074 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3075 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003076 return v;
3077 }
3078
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003079 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003080 PyErr_SetString(PyExc_TypeError,
3081 "decoding str is not supported");
3082 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003083 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003084
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003085 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3086 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3087 PyErr_Format(PyExc_TypeError,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003088 "decoding to str: need a bytes-like object, %.80s found",
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003089 Py_TYPE(obj)->tp_name);
3090 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003091 }
Tim Petersced69f82003-09-16 20:30:58 +00003092
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003093 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003094 PyBuffer_Release(&buffer);
3095 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003096 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003097
Serhiy Storchaka05997252013-01-26 12:14:02 +02003098 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003099 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003100 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003101}
3102
/* Normalize an encoding name: similar to encodings.normalize_encoding(), but
   also convert to lowercase.

   Alphanumeric characters and '.' are copied in lowercase; every run of
   other characters between two copied characters becomes a single '_',
   while such runs at the start or end of the name are dropped.

   Return 1 on success, or 0 on error (encoding is longer than
   lower_len-1). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *src;
    char *out = lower;
    char *out_end = &lower[lower_len - 1];  /* slot kept for the final NUL */
    int pending_sep = 0;

    assert(encoding != NULL);

    for (src = encoding; *src != 0; src++) {
        char ch = *src;

        if (!(Py_ISALNUM(ch) || ch == '.')) {
            /* Remember the separator; it is only emitted if another
               copied character follows. */
            pending_sep = 1;
            continue;
        }

        if (pending_sep && out != lower) {
            if (out == out_end) {
                return 0;
            }
            *out++ = '_';
        }
        pending_sep = 0;

        if (out == out_end) {
            return 0;
        }
        *out++ = Py_TOLOWER(ch);
    }

    *out = '\0';
    return 1;
}
3151
Alexander Belopolsky40018472011-02-26 01:02:56 +00003152PyObject *
3153PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003154 Py_ssize_t size,
3155 const char *encoding,
3156 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003157{
3158 PyObject *buffer = NULL, *unicode;
3159 Py_buffer info;
Victor Stinner942889a2016-09-05 15:40:10 -07003160 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3161
3162 if (encoding == NULL) {
3163 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3164 }
Victor Stinner600d3be2010-06-10 12:00:55 +00003165
Fred Drakee4315f52000-05-09 19:53:39 +00003166 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003167 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3168 char *lower = buflower;
3169
3170 /* Fast paths */
3171 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3172 lower += 3;
3173 if (*lower == '_') {
3174 /* Match "utf8" and "utf_8" */
3175 lower++;
3176 }
3177
3178 if (lower[0] == '8' && lower[1] == 0) {
3179 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3180 }
3181 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3182 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3183 }
3184 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3185 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3186 }
3187 }
3188 else {
3189 if (strcmp(lower, "ascii") == 0
3190 || strcmp(lower, "us_ascii") == 0) {
3191 return PyUnicode_DecodeASCII(s, size, errors);
3192 }
Steve Dowercc16be82016-09-08 10:35:16 -07003193 #ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003194 else if (strcmp(lower, "mbcs") == 0) {
3195 return PyUnicode_DecodeMBCS(s, size, errors);
3196 }
3197 #endif
3198 else if (strcmp(lower, "latin1") == 0
3199 || strcmp(lower, "latin_1") == 0
3200 || strcmp(lower, "iso_8859_1") == 0
3201 || strcmp(lower, "iso8859_1") == 0) {
3202 return PyUnicode_DecodeLatin1(s, size, errors);
3203 }
3204 }
Victor Stinner37296e82010-06-10 13:36:23 +00003205 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003206
3207 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003208 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003209 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003210 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003211 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003212 if (buffer == NULL)
3213 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003214 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003215 if (unicode == NULL)
3216 goto onError;
3217 if (!PyUnicode_Check(unicode)) {
3218 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003219 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3220 "use codecs.decode() to decode to arbitrary types",
3221 encoding,
Benjamin Peterson8d761ff2016-10-16 15:41:46 -07003222 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003223 Py_DECREF(unicode);
3224 goto onError;
3225 }
3226 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003227 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003228
Benjamin Peterson29060642009-01-31 22:14:21 +00003229 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003230 Py_XDECREF(buffer);
3231 return NULL;
3232}
3233
Alexander Belopolsky40018472011-02-26 01:02:56 +00003234PyObject *
3235PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003236 const char *encoding,
3237 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003238{
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003239 if (!PyUnicode_Check(unicode)) {
3240 PyErr_BadArgument();
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003241 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003242 }
3243
Serhiy Storchaka00939072016-10-27 21:05:49 +03003244 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3245 "PyUnicode_AsDecodedObject() is deprecated; "
3246 "use PyCodec_Decode() to decode from str", 1) < 0)
3247 return NULL;
3248
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003249 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003250 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003251
3252 /* Decode via the codec registry */
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003253 return PyCodec_Decode(unicode, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003254}
3255
Alexander Belopolsky40018472011-02-26 01:02:56 +00003256PyObject *
3257PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003258 const char *encoding,
3259 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003260{
3261 PyObject *v;
3262
3263 if (!PyUnicode_Check(unicode)) {
3264 PyErr_BadArgument();
3265 goto onError;
3266 }
3267
Serhiy Storchaka00939072016-10-27 21:05:49 +03003268 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3269 "PyUnicode_AsDecodedUnicode() is deprecated; "
3270 "use PyCodec_Decode() to decode from str to str", 1) < 0)
3271 return NULL;
3272
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003273 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003274 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003275
3276 /* Decode via the codec registry */
3277 v = PyCodec_Decode(unicode, encoding, errors);
3278 if (v == NULL)
3279 goto onError;
3280 if (!PyUnicode_Check(v)) {
3281 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003282 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3283 "use codecs.decode() to decode to arbitrary types",
3284 encoding,
Benjamin Peterson8d761ff2016-10-16 15:41:46 -07003285 Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003286 Py_DECREF(v);
3287 goto onError;
3288 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003289 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003290
Benjamin Peterson29060642009-01-31 22:14:21 +00003291 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003292 return NULL;
3293}
3294
Alexander Belopolsky40018472011-02-26 01:02:56 +00003295PyObject *
3296PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003297 Py_ssize_t size,
3298 const char *encoding,
3299 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003300{
3301 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003302
Guido van Rossumd57fd912000-03-10 22:53:23 +00003303 unicode = PyUnicode_FromUnicode(s, size);
3304 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003305 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003306 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3307 Py_DECREF(unicode);
3308 return v;
3309}
3310
Alexander Belopolsky40018472011-02-26 01:02:56 +00003311PyObject *
3312PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003313 const char *encoding,
3314 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003315{
3316 PyObject *v;
3317
3318 if (!PyUnicode_Check(unicode)) {
3319 PyErr_BadArgument();
3320 goto onError;
3321 }
3322
Serhiy Storchaka00939072016-10-27 21:05:49 +03003323 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3324 "PyUnicode_AsEncodedObject() is deprecated; "
3325 "use PyUnicode_AsEncodedString() to encode from str to bytes "
3326 "or PyCodec_Encode() for generic encoding", 1) < 0)
3327 return NULL;
3328
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003329 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003330 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003331
3332 /* Encode via the codec registry */
3333 v = PyCodec_Encode(unicode, encoding, errors);
3334 if (v == NULL)
3335 goto onError;
3336 return v;
3337
Benjamin Peterson29060642009-01-31 22:14:21 +00003338 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003339 return NULL;
3340}
3341
/* Locate the first wide character of wstr that wcstombs() cannot encode.

   The string is fed to wcstombs() one character at a time (one surrogate
   pair at a time when wchar_t is 16 bits wide).  Return the index, in
   wchar_t units, of the first piece that fails to convert, or 0 if every
   piece converts successfully. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
#if SIZEOF_WCHAR_T == 2
    wchar_t unit[3];
#else
    wchar_t unit[2];
#endif
    char scratch[MB_LEN_MAX];
    const wchar_t *cursor = wstr;

    while (*cursor != L'\0') {
        const wchar_t *unit_start = cursor;
        size_t nbytes;

#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cursor[1]))
        {
            /* Keep both halves of a surrogate pair together. */
            unit[0] = cursor[0];
            unit[1] = cursor[1];
            unit[2] = 0;
            cursor += 2;
        }
        else {
            unit[0] = cursor[0];
            unit[1] = 0;
            cursor += 1;
        }
#else
        unit[0] = cursor[0];
        unit[1] = 0;
        cursor += 1;
#endif

        nbytes = wcstombs(scratch, unit, sizeof(scratch));
        if (nbytes == (size_t)-1) {
            return unit_start - wstr;
        }
    }

    /* failed to find the unencodable character */
    return 0;
}
3388
Victor Stinner1b579672011-12-17 05:47:23 +01003389static int
3390locale_error_handler(const char *errors, int *surrogateescape)
3391{
Victor Stinner50149202015-09-22 00:26:54 +02003392 _Py_error_handler error_handler = get_error_handler(errors);
3393 switch (error_handler)
3394 {
3395 case _Py_ERROR_STRICT:
Victor Stinner1b579672011-12-17 05:47:23 +01003396 *surrogateescape = 0;
3397 return 0;
Victor Stinner50149202015-09-22 00:26:54 +02003398 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner1b579672011-12-17 05:47:23 +01003399 *surrogateescape = 1;
3400 return 0;
Victor Stinner50149202015-09-22 00:26:54 +02003401 default:
3402 PyErr_Format(PyExc_ValueError,
3403 "only 'strict' and 'surrogateescape' error handlers "
3404 "are supported, not '%s'",
3405 errors);
3406 return -1;
Victor Stinner1b579672011-12-17 05:47:23 +01003407 }
Victor Stinner1b579672011-12-17 05:47:23 +01003408}
3409
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003410PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003411PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003412{
3413 Py_ssize_t wlen, wlen2;
3414 wchar_t *wstr;
3415 PyObject *bytes = NULL;
3416 char *errmsg;
Raymond Hettingere56666d2013-08-04 11:51:03 -07003417 PyObject *reason = NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003418 PyObject *exc;
3419 size_t error_pos;
Victor Stinner1b579672011-12-17 05:47:23 +01003420 int surrogateescape;
3421
3422 if (locale_error_handler(errors, &surrogateescape) < 0)
3423 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003424
3425 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3426 if (wstr == NULL)
3427 return NULL;
3428
3429 wlen2 = wcslen(wstr);
3430 if (wlen2 != wlen) {
3431 PyMem_Free(wstr);
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003432 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003433 return NULL;
3434 }
3435
3436 if (surrogateescape) {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003437 /* "surrogateescape" error handler */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003438 char *str;
3439
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003440 str = Py_EncodeLocale(wstr, &error_pos);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003441 if (str == NULL) {
3442 if (error_pos == (size_t)-1) {
3443 PyErr_NoMemory();
3444 PyMem_Free(wstr);
3445 return NULL;
3446 }
3447 else {
3448 goto encode_error;
3449 }
3450 }
3451 PyMem_Free(wstr);
3452
3453 bytes = PyBytes_FromString(str);
3454 PyMem_Free(str);
3455 }
3456 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003457 /* strict mode */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003458 size_t len, len2;
3459
3460 len = wcstombs(NULL, wstr, 0);
3461 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003462 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003463 goto encode_error;
3464 }
3465
3466 bytes = PyBytes_FromStringAndSize(NULL, len);
3467 if (bytes == NULL) {
3468 PyMem_Free(wstr);
3469 return NULL;
3470 }
3471
3472 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3473 if (len2 == (size_t)-1 || len2 > len) {
Victor Stinner2f197072011-12-17 07:08:30 +01003474 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003475 goto encode_error;
3476 }
3477 PyMem_Free(wstr);
3478 }
3479 return bytes;
3480
3481encode_error:
3482 errmsg = strerror(errno);
3483 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003484
3485 if (error_pos == (size_t)-1)
3486 error_pos = wcstombs_errorpos(wstr);
3487
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003488 PyMem_Free(wstr);
3489 Py_XDECREF(bytes);
3490
Victor Stinner2f197072011-12-17 07:08:30 +01003491 if (errmsg != NULL) {
3492 size_t errlen;
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003493 wstr = Py_DecodeLocale(errmsg, &errlen);
Victor Stinner2f197072011-12-17 07:08:30 +01003494 if (wstr != NULL) {
3495 reason = PyUnicode_FromWideChar(wstr, errlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003496 PyMem_RawFree(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003497 } else
3498 errmsg = NULL;
3499 }
3500 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003501 reason = PyUnicode_FromString(
3502 "wcstombs() encountered an unencodable "
3503 "wide character");
3504 if (reason == NULL)
3505 return NULL;
3506
3507 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3508 "locale", unicode,
3509 (Py_ssize_t)error_pos,
3510 (Py_ssize_t)(error_pos+1),
3511 reason);
3512 Py_DECREF(reason);
3513 if (exc != NULL) {
3514 PyCodec_StrictErrors(exc);
3515 Py_XDECREF(exc);
3516 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003517 return NULL;
3518}
3519
Victor Stinnerad158722010-10-27 00:25:46 +00003520PyObject *
3521PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003522{
Steve Dowercc16be82016-09-08 10:35:16 -07003523#if defined(__APPLE__)
3524 return _PyUnicode_AsUTF8String(unicode, Py_FileSystemDefaultEncodeErrors);
Victor Stinnerad158722010-10-27 00:25:46 +00003525#else
Victor Stinner793b5312011-04-27 00:24:21 +02003526 PyInterpreterState *interp = PyThreadState_GET()->interp;
3527 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3528 cannot use it to encode and decode filenames before it is loaded. Load
3529 the Python codec requires to encode at least its own filename. Use the C
3530 version of the locale codec until the codec registry is initialized and
3531 the Python codec is loaded.
3532
3533 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3534 cannot only rely on it: check also interp->fscodec_initialized for
3535 subinterpreters. */
3536 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003537 return PyUnicode_AsEncodedString(unicode,
3538 Py_FileSystemDefaultEncoding,
Steve Dowercc16be82016-09-08 10:35:16 -07003539 Py_FileSystemDefaultEncodeErrors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003540 }
3541 else {
Steve Dowercc16be82016-09-08 10:35:16 -07003542 return PyUnicode_EncodeLocale(unicode, Py_FileSystemDefaultEncodeErrors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003543 }
Victor Stinnerad158722010-10-27 00:25:46 +00003544#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003545}
3546
Alexander Belopolsky40018472011-02-26 01:02:56 +00003547PyObject *
3548PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003549 const char *encoding,
3550 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003551{
3552 PyObject *v;
Victor Stinner942889a2016-09-05 15:40:10 -07003553 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003554
Guido van Rossumd57fd912000-03-10 22:53:23 +00003555 if (!PyUnicode_Check(unicode)) {
3556 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003557 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003558 }
Fred Drakee4315f52000-05-09 19:53:39 +00003559
Victor Stinner942889a2016-09-05 15:40:10 -07003560 if (encoding == NULL) {
3561 return _PyUnicode_AsUTF8String(unicode, errors);
3562 }
3563
Fred Drakee4315f52000-05-09 19:53:39 +00003564 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003565 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3566 char *lower = buflower;
3567
3568 /* Fast paths */
3569 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3570 lower += 3;
3571 if (*lower == '_') {
3572 /* Match "utf8" and "utf_8" */
3573 lower++;
3574 }
3575
3576 if (lower[0] == '8' && lower[1] == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003577 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner942889a2016-09-05 15:40:10 -07003578 }
3579 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3580 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3581 }
3582 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3583 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3584 }
Victor Stinnera5c68c32011-03-02 01:03:14 +00003585 }
Victor Stinner942889a2016-09-05 15:40:10 -07003586 else {
3587 if (strcmp(lower, "ascii") == 0
3588 || strcmp(lower, "us_ascii") == 0) {
3589 return _PyUnicode_AsASCIIString(unicode, errors);
3590 }
Steve Dowercc16be82016-09-08 10:35:16 -07003591#ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003592 else if (strcmp(lower, "mbcs") == 0) {
3593 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3594 }
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003595#endif
Victor Stinner942889a2016-09-05 15:40:10 -07003596 else if (strcmp(lower, "latin1") == 0 ||
3597 strcmp(lower, "latin_1") == 0 ||
3598 strcmp(lower, "iso_8859_1") == 0 ||
3599 strcmp(lower, "iso8859_1") == 0) {
3600 return _PyUnicode_AsLatin1String(unicode, errors);
3601 }
3602 }
Victor Stinner37296e82010-06-10 13:36:23 +00003603 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003604
3605 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003606 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003607 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003608 return NULL;
3609
3610 /* The normal path */
3611 if (PyBytes_Check(v))
3612 return v;
3613
3614 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003615 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003616 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003617 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003618
3619 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003620 "encoder %s returned bytearray instead of bytes; "
3621 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003622 encoding);
3623 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003624 Py_DECREF(v);
3625 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003626 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003627
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003628 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3629 Py_DECREF(v);
3630 return b;
3631 }
3632
3633 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003634 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
3635 "use codecs.encode() to encode to arbitrary types",
3636 encoding,
Benjamin Peterson8d761ff2016-10-16 15:41:46 -07003637 Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003638 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003639 return NULL;
3640}
3641
Alexander Belopolsky40018472011-02-26 01:02:56 +00003642PyObject *
3643PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003644 const char *encoding,
3645 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003646{
3647 PyObject *v;
3648
3649 if (!PyUnicode_Check(unicode)) {
3650 PyErr_BadArgument();
3651 goto onError;
3652 }
3653
Serhiy Storchaka00939072016-10-27 21:05:49 +03003654 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3655 "PyUnicode_AsEncodedUnicode() is deprecated; "
3656 "use PyCodec_Encode() to encode from str to str", 1) < 0)
3657 return NULL;
3658
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003659 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003660 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003661
3662 /* Encode via the codec registry */
3663 v = PyCodec_Encode(unicode, encoding, errors);
3664 if (v == NULL)
3665 goto onError;
3666 if (!PyUnicode_Check(v)) {
3667 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003668 "'%.400s' encoder returned '%.400s' instead of 'str'; "
3669 "use codecs.encode() to encode to arbitrary types",
3670 encoding,
Benjamin Peterson8d761ff2016-10-16 15:41:46 -07003671 Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003672 Py_DECREF(v);
3673 goto onError;
3674 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003675 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003676
Benjamin Peterson29060642009-01-31 22:14:21 +00003677 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003678 return NULL;
3679}
3680
/* Locate the first byte sequence in str[0..len) that mbrtowc() cannot
   convert.

   Returns the byte offset of the failing (invalid or incomplete) sequence.
   Returns 0 when no such sequence is found, when an embedded NUL is reached
   first, or when the build has no mbrtowc() (HAVE_MBRTOWC undefined). */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *cursor = str;
    size_t remaining = len;
    mbstate_t state;
    wchar_t wc;

    memset(&state, 0, sizeof state);
    while (remaining != 0) {
        size_t consumed = mbrtowc(&wc, cursor, remaining, &state);

        if (consumed == 0) {
            /* Reached end of string */
            break;
        }
        if (consumed == (size_t)-1 || consumed == (size_t)-2) {
            /* Conversion error or incomplete character */
            return cursor - str;
        }
        cursor += consumed;
        remaining -= consumed;
    }
    /* failed to find the undecodable byte sequence */
#endif
    return 0;
}
3711
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003712PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003713PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003714 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003715{
3716 wchar_t smallbuf[256];
3717 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3718 wchar_t *wstr;
3719 size_t wlen, wlen2;
3720 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003721 int surrogateescape;
Victor Stinner2f197072011-12-17 07:08:30 +01003722 size_t error_pos;
3723 char *errmsg;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01003724 PyObject *reason = NULL; /* initialize to prevent gcc warning */
3725 PyObject *exc;
Victor Stinner1b579672011-12-17 05:47:23 +01003726
3727 if (locale_error_handler(errors, &surrogateescape) < 0)
3728 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003729
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003730 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3731 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003732 return NULL;
3733 }
3734
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003735 if (surrogateescape) {
3736 /* "surrogateescape" error handler */
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003737 wstr = Py_DecodeLocale(str, &wlen);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003738 if (wstr == NULL) {
3739 if (wlen == (size_t)-1)
3740 PyErr_NoMemory();
3741 else
3742 PyErr_SetFromErrno(PyExc_OSError);
3743 return NULL;
3744 }
3745
3746 unicode = PyUnicode_FromWideChar(wstr, wlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003747 PyMem_RawFree(wstr);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003748 }
3749 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003750 /* strict mode */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003751#ifndef HAVE_BROKEN_MBSTOWCS
3752 wlen = mbstowcs(NULL, str, 0);
3753#else
3754 wlen = len;
3755#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003756 if (wlen == (size_t)-1)
3757 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003758 if (wlen+1 <= smallbuf_len) {
3759 wstr = smallbuf;
3760 }
3761 else {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02003762 wstr = PyMem_New(wchar_t, wlen+1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003763 if (!wstr)
3764 return PyErr_NoMemory();
3765 }
3766
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003767 wlen2 = mbstowcs(wstr, str, wlen+1);
3768 if (wlen2 == (size_t)-1) {
3769 if (wstr != smallbuf)
3770 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003771 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003772 }
3773#ifdef HAVE_BROKEN_MBSTOWCS
3774 assert(wlen2 == wlen);
3775#endif
3776 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3777 if (wstr != smallbuf)
3778 PyMem_Free(wstr);
3779 }
3780 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003781
3782decode_error:
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +02003783 reason = NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01003784 errmsg = strerror(errno);
3785 assert(errmsg != NULL);
3786
3787 error_pos = mbstowcs_errorpos(str, len);
3788 if (errmsg != NULL) {
3789 size_t errlen;
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003790 wstr = Py_DecodeLocale(errmsg, &errlen);
Victor Stinner2f197072011-12-17 07:08:30 +01003791 if (wstr != NULL) {
3792 reason = PyUnicode_FromWideChar(wstr, errlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003793 PyMem_RawFree(wstr);
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +02003794 }
Victor Stinner2f197072011-12-17 07:08:30 +01003795 }
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +02003796 if (reason == NULL)
Victor Stinner2f197072011-12-17 07:08:30 +01003797 reason = PyUnicode_FromString(
3798 "mbstowcs() encountered an invalid multibyte sequence");
3799 if (reason == NULL)
3800 return NULL;
3801
3802 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3803 "locale", str, len,
3804 (Py_ssize_t)error_pos,
3805 (Py_ssize_t)(error_pos+1),
3806 reason);
3807 Py_DECREF(reason);
3808 if (exc != NULL) {
3809 PyCodec_StrictErrors(exc);
3810 Py_XDECREF(exc);
3811 }
3812 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003813}
3814
3815PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003816PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003817{
3818 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003819 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003820}
3821
3822
3823PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003824PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003825 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003826 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3827}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003828
Christian Heimes5894ba72007-11-04 11:43:14 +00003829PyObject*
3830PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3831{
Steve Dowercc16be82016-09-08 10:35:16 -07003832#if defined(__APPLE__)
3833 return PyUnicode_DecodeUTF8Stateful(s, size, Py_FileSystemDefaultEncodeErrors, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003834#else
Victor Stinner793b5312011-04-27 00:24:21 +02003835 PyInterpreterState *interp = PyThreadState_GET()->interp;
3836 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3837 cannot use it to encode and decode filenames before it is loaded. Load
3838 the Python codec requires to encode at least its own filename. Use the C
3839 version of the locale codec until the codec registry is initialized and
3840 the Python codec is loaded.
3841
3842 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3843 cannot only rely on it: check also interp->fscodec_initialized for
3844 subinterpreters. */
3845 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Steve Dowercc16be82016-09-08 10:35:16 -07003846 PyObject *res = PyUnicode_Decode(s, size,
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003847 Py_FileSystemDefaultEncoding,
Steve Dowercc16be82016-09-08 10:35:16 -07003848 Py_FileSystemDefaultEncodeErrors);
3849#ifdef MS_WINDOWS
3850 if (!res && PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
Serhiy Storchaka467ab192016-10-21 17:09:17 +03003851 _PyErr_FormatFromCause(PyExc_RuntimeError,
3852 "filesystem path bytes were not correctly encoded with '%s'. "
Steve Dowercc16be82016-09-08 10:35:16 -07003853 "Please report this at http://bugs.python.org/issue27781",
3854 Py_FileSystemDefaultEncoding);
Steve Dowercc16be82016-09-08 10:35:16 -07003855 }
3856#endif
3857 return res;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003858 }
3859 else {
Steve Dowercc16be82016-09-08 10:35:16 -07003860 return PyUnicode_DecodeLocaleAndSize(s, size, Py_FileSystemDefaultEncodeErrors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003861 }
Victor Stinnerad158722010-10-27 00:25:46 +00003862#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003863}
3864
Martin v. Löwis011e8422009-05-05 04:43:17 +00003865
3866int
3867PyUnicode_FSConverter(PyObject* arg, void* addr)
3868{
Brett Cannonec6ce872016-09-06 15:50:29 -07003869 PyObject *path = NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003870 PyObject *output = NULL;
3871 Py_ssize_t size;
3872 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003873 if (arg == NULL) {
3874 Py_DECREF(*(PyObject**)addr);
Benjamin Petersona4d33b32015-11-15 21:57:39 -08003875 *(PyObject**)addr = NULL;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003876 return 1;
3877 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003878 path = PyOS_FSPath(arg);
3879 if (path == NULL) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003880 return 0;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003881 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003882 if (PyBytes_Check(path)) {
3883 output = path;
3884 }
3885 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
3886 output = PyUnicode_EncodeFSDefault(path);
3887 Py_DECREF(path);
3888 if (!output) {
3889 return 0;
3890 }
3891 assert(PyBytes_Check(output));
3892 }
3893
Victor Stinner0ea2a462010-04-30 00:22:08 +00003894 size = PyBytes_GET_SIZE(output);
3895 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02003896 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003897 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003898 Py_DECREF(output);
3899 return 0;
3900 }
3901 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003902 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003903}
3904
3905
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003906int
3907PyUnicode_FSDecoder(PyObject* arg, void* addr)
3908{
Brett Cannona5711202016-09-06 19:36:01 -07003909 int is_buffer = 0;
3910 PyObject *path = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003911 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003912 if (arg == NULL) {
3913 Py_DECREF(*(PyObject**)addr);
3914 return 1;
3915 }
Brett Cannona5711202016-09-06 19:36:01 -07003916
3917 is_buffer = PyObject_CheckBuffer(arg);
3918 if (!is_buffer) {
3919 path = PyOS_FSPath(arg);
3920 if (path == NULL) {
Serhiy Storchakafebc3322016-08-06 23:29:29 +03003921 return 0;
3922 }
Brett Cannona5711202016-09-06 19:36:01 -07003923 }
3924 else {
3925 path = arg;
3926 Py_INCREF(arg);
3927 }
3928
3929 if (PyUnicode_Check(path)) {
3930 if (PyUnicode_READY(path) == -1) {
3931 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003932 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07003933 }
3934 output = path;
3935 }
3936 else if (PyBytes_Check(path) || is_buffer) {
3937 PyObject *path_bytes = NULL;
3938
3939 if (!PyBytes_Check(path) &&
3940 PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
3941 "path should be string, bytes, or os.PathLike, not %.200s",
3942 Py_TYPE(arg)->tp_name)) {
3943 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003944 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07003945 }
3946 path_bytes = PyBytes_FromObject(path);
3947 Py_DECREF(path);
3948 if (!path_bytes) {
3949 return 0;
3950 }
3951 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
3952 PyBytes_GET_SIZE(path_bytes));
3953 Py_DECREF(path_bytes);
3954 if (!output) {
3955 return 0;
3956 }
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003957 }
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003958 else {
3959 PyErr_Format(PyExc_TypeError,
Brett Cannona5711202016-09-06 19:36:01 -07003960 "path should be string, bytes, or os.PathLike, not %.200s",
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003961 Py_TYPE(arg)->tp_name);
Brett Cannona5711202016-09-06 19:36:01 -07003962 Py_DECREF(path);
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003963 return 0;
3964 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003965 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003966 Py_DECREF(output);
3967 return 0;
3968 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003969 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003970 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003971 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003972 Py_DECREF(output);
3973 return 0;
3974 }
3975 *(PyObject**)addr = output;
3976 return Py_CLEANUP_SUPPORTED;
3977}
3978
3979
Martin v. Löwis5b222132007-06-10 09:51:05 +00003980char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003981PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003982{
Christian Heimesf3863112007-11-22 07:46:41 +00003983 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003984
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003985 if (!PyUnicode_Check(unicode)) {
3986 PyErr_BadArgument();
3987 return NULL;
3988 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003989 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003990 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003991
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003992 if (PyUnicode_UTF8(unicode) == NULL) {
3993 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003994 bytes = _PyUnicode_AsUTF8String(unicode, NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003995 if (bytes == NULL)
3996 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003997 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3998 if (_PyUnicode_UTF8(unicode) == NULL) {
Victor Stinnera5afb582013-10-29 01:28:23 +01003999 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004000 Py_DECREF(bytes);
4001 return NULL;
4002 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004003 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
Christian Heimesf051e432016-09-13 20:22:02 +02004004 memcpy(_PyUnicode_UTF8(unicode),
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004005 PyBytes_AS_STRING(bytes),
4006 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004007 Py_DECREF(bytes);
4008 }
4009
4010 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004011 *psize = PyUnicode_UTF8_LENGTH(unicode);
4012 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004013}
4014
4015char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004016PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004017{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004018 return PyUnicode_AsUTF8AndSize(unicode, NULL);
4019}
4020
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004021Py_UNICODE *
4022PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
4023{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004024 const unsigned char *one_byte;
4025#if SIZEOF_WCHAR_T == 4
4026 const Py_UCS2 *two_bytes;
4027#else
4028 const Py_UCS4 *four_bytes;
4029 const Py_UCS4 *ucs4_end;
4030 Py_ssize_t num_surrogates;
4031#endif
4032 wchar_t *w;
4033 wchar_t *wchar_end;
4034
4035 if (!PyUnicode_Check(unicode)) {
4036 PyErr_BadArgument();
4037 return NULL;
4038 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004039 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004040 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004041 assert(_PyUnicode_KIND(unicode) != 0);
4042 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004043
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004044 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004045#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004046 four_bytes = PyUnicode_4BYTE_DATA(unicode);
4047 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004048 num_surrogates = 0;
4049
4050 for (; four_bytes < ucs4_end; ++four_bytes) {
4051 if (*four_bytes > 0xFFFF)
4052 ++num_surrogates;
4053 }
4054
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004055 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
4056 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
4057 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004058 PyErr_NoMemory();
4059 return NULL;
4060 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004061 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004062
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004063 w = _PyUnicode_WSTR(unicode);
4064 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
4065 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004066 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
4067 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004068 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004069 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01004070 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
4071 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004072 }
4073 else
4074 *w = *four_bytes;
4075
4076 if (w > wchar_end) {
4077 assert(0 && "Miscalculated string end");
4078 }
4079 }
4080 *w = 0;
4081#else
4082 /* sizeof(wchar_t) == 4 */
4083 Py_FatalError("Impossible unicode object state, wstr and str "
4084 "should share memory already.");
4085 return NULL;
4086#endif
4087 }
4088 else {
Serhiy Storchakae55181f2015-02-20 21:34:06 +02004089 if ((size_t)_PyUnicode_LENGTH(unicode) >
4090 PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
4091 PyErr_NoMemory();
4092 return NULL;
4093 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004094 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
4095 (_PyUnicode_LENGTH(unicode) + 1));
4096 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004097 PyErr_NoMemory();
4098 return NULL;
4099 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004100 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
4101 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
4102 w = _PyUnicode_WSTR(unicode);
4103 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004104
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004105 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
4106 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004107 for (; w < wchar_end; ++one_byte, ++w)
4108 *w = *one_byte;
4109 /* null-terminate the wstr */
4110 *w = 0;
4111 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004112 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004113#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004114 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004115 for (; w < wchar_end; ++two_bytes, ++w)
4116 *w = *two_bytes;
4117 /* null-terminate the wstr */
4118 *w = 0;
4119#else
4120 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004121 PyObject_FREE(_PyUnicode_WSTR(unicode));
4122 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004123 Py_FatalError("Impossible unicode object state, wstr "
4124 "and str should share memory already.");
4125 return NULL;
4126#endif
4127 }
4128 else {
4129 assert(0 && "This should never happen.");
4130 }
4131 }
4132 }
4133 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004134 *size = PyUnicode_WSTR_LENGTH(unicode);
4135 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00004136}
4137
Alexander Belopolsky40018472011-02-26 01:02:56 +00004138Py_UNICODE *
4139PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004140{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004141 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004142}
4143
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004144
Alexander Belopolsky40018472011-02-26 01:02:56 +00004145Py_ssize_t
4146PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004147{
4148 if (!PyUnicode_Check(unicode)) {
4149 PyErr_BadArgument();
4150 goto onError;
4151 }
4152 return PyUnicode_GET_SIZE(unicode);
4153
Benjamin Peterson29060642009-01-31 22:14:21 +00004154 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004155 return -1;
4156}
4157
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004158Py_ssize_t
4159PyUnicode_GetLength(PyObject *unicode)
4160{
Victor Stinner07621332012-06-16 04:53:46 +02004161 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004162 PyErr_BadArgument();
4163 return -1;
4164 }
Victor Stinner07621332012-06-16 04:53:46 +02004165 if (PyUnicode_READY(unicode) == -1)
4166 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004167 return PyUnicode_GET_LENGTH(unicode);
4168}
4169
4170Py_UCS4
4171PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4172{
Victor Stinner69ed0f42013-04-09 21:48:24 +02004173 void *data;
4174 int kind;
4175
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004176 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
4177 PyErr_BadArgument();
4178 return (Py_UCS4)-1;
4179 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004180 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004181 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004182 return (Py_UCS4)-1;
4183 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02004184 data = PyUnicode_DATA(unicode);
4185 kind = PyUnicode_KIND(unicode);
4186 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004187}
4188
4189int
4190PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4191{
4192 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004193 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004194 return -1;
4195 }
Victor Stinner488fa492011-12-12 00:01:39 +01004196 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004197 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004198 PyErr_SetString(PyExc_IndexError, "string index out of range");
4199 return -1;
4200 }
Victor Stinner488fa492011-12-12 00:01:39 +01004201 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004202 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004203 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4204 PyErr_SetString(PyExc_ValueError, "character out of range");
4205 return -1;
4206 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004207 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4208 index, ch);
4209 return 0;
4210}
4211
/* Return the name of the default encoding, which is always "utf-8".
   The returned string has static storage. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4217
Victor Stinner554f3f02010-06-16 23:33:54 +00004218/* create or adjust a UnicodeDecodeError */
4219static void
4220make_decode_exception(PyObject **exceptionObject,
4221 const char *encoding,
4222 const char *input, Py_ssize_t length,
4223 Py_ssize_t startpos, Py_ssize_t endpos,
4224 const char *reason)
4225{
4226 if (*exceptionObject == NULL) {
4227 *exceptionObject = PyUnicodeDecodeError_Create(
4228 encoding, input, length, startpos, endpos, reason);
4229 }
4230 else {
4231 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4232 goto onError;
4233 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4234 goto onError;
4235 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4236 goto onError;
4237 }
4238 return;
4239
4240onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004241 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004242}
4243
#ifdef MS_WINDOWS
/* error handling callback helper (variant writing into a wchar_t string):
   build arguments, call the callback and check the arguments,
   if no exception occurred, copy the replacement to the output
   and adjust various state variables.

   errors/errorHandler: name of the error handler and a cache slot for the
       looked-up callable (filled in on first use).
   encoding/reason:     values stored in the UnicodeDecodeError.
   input/inend:         bounds of the input; re-read from the exception
       after the callback, which may have replaced the input object.
   startinpos/endinpos: extent of the offending input; *endinpos is set to
       the resume position returned by the callback.
   exceptionObject:     cache slot for the exception passed to the callback.
   inptr:               set to the input position where decoding resumes.
   output/outpos:       wchar_t-kind output string and write position;
       the string is resized if the replacement does not fit.

   return 0 on success, -1 on error
*/

static int
unicode_decode_call_errorhandler_wchar(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    PyObject **output, Py_ssize_t *outpos)
{
    static const char *argparse = "O!n;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t outsize;
    Py_ssize_t insize;
    Py_ssize_t requiredsize;
    Py_ssize_t newpos;
    PyObject *inputobj = NULL;
    wchar_t *repwstr;
    Py_ssize_t repwlen;

    assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
    outsize = _PyUnicode_WSTR_LENGTH(*output);

    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        /* &argparse[4] skips the "O!n;" format prefix, leaving the message */
        PyErr_SetString(PyExc_TypeError, &argparse[4]);
        goto onError;
    }
    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    if (!PyBytes_Check(inputobj)) {
        PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
        /* Do not fall through: the object must not be read as bytes, and
           the reference obtained above has to be released. */
        Py_DECREF(inputobj);
        goto onError;
    }
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    /* a negative position is relative to the end of the input */
    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
    if (repwstr == NULL)
        goto onError;
    /* need more space? (at least enough for what we
       have+the replacement+the rest of the string (starting
       at the new input position), so we won't have to check space
       when there are no errors in the rest of the string) */
    requiredsize = *outpos;
    if (requiredsize > PY_SSIZE_T_MAX - repwlen)
        goto overflow;
    requiredsize += repwlen;
    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
        goto overflow;
    requiredsize += insize - newpos;
    if (requiredsize > outsize) {
        /* grow geometrically when doubling is enough and cannot overflow */
        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
            requiredsize = 2*outsize;
        if (unicode_resize(output, requiredsize) < 0)
            goto onError;
    }
    /* Copy exactly repwlen characters.  memcpy is used rather than
       wcsncpy because wcsncpy stops at the first NUL in the source and
       zero-fills the remainder, which would truncate a replacement
       string containing embedded NUL characters. */
    memcpy(_PyUnicode_WSTR(*output) + *outpos, repwstr,
           repwlen * sizeof(wchar_t));
    *outpos += repwlen;
    *endinpos = newpos;
    *inptr = *input + newpos;

    /* we made it! */
    Py_XDECREF(restuple);
    return 0;

  overflow:
    PyErr_SetString(PyExc_OverflowError,
                    "decoded result is too long for a Python string");

  onError:
    Py_XDECREF(restuple);
    return -1;
}
#endif   /* MS_WINDOWS */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004359
4360static int
4361unicode_decode_call_errorhandler_writer(
4362 const char *errors, PyObject **errorHandler,
4363 const char *encoding, const char *reason,
4364 const char **input, const char **inend, Py_ssize_t *startinpos,
4365 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4366 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4367{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02004368 static const char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004369
4370 PyObject *restuple = NULL;
4371 PyObject *repunicode = NULL;
4372 Py_ssize_t insize;
4373 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004374 Py_ssize_t replen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004375 PyObject *inputobj = NULL;
4376
4377 if (*errorHandler == NULL) {
4378 *errorHandler = PyCodec_LookupError(errors);
4379 if (*errorHandler == NULL)
4380 goto onError;
4381 }
4382
4383 make_decode_exception(exceptionObject,
4384 encoding,
4385 *input, *inend - *input,
4386 *startinpos, *endinpos,
4387 reason);
4388 if (*exceptionObject == NULL)
4389 goto onError;
4390
4391 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4392 if (restuple == NULL)
4393 goto onError;
4394 if (!PyTuple_Check(restuple)) {
4395 PyErr_SetString(PyExc_TypeError, &argparse[4]);
4396 goto onError;
4397 }
4398 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004399 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004400
4401 /* Copy back the bytes variables, which might have been modified by the
4402 callback */
4403 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4404 if (!inputobj)
4405 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00004406 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004407 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00004408 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004409 *input = PyBytes_AS_STRING(inputobj);
4410 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004411 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004412 /* we can DECREF safely, as the exception has another reference,
4413 so the object won't go away. */
4414 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004415
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004416 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004417 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004418 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004419 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004420 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004421 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004422
Victor Stinner8f674cc2013-04-17 23:02:17 +02004423 if (PyUnicode_READY(repunicode) < 0)
4424 goto onError;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004425 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004426 if (replen > 1) {
4427 writer->min_length += replen - 1;
Victor Stinner8f674cc2013-04-17 23:02:17 +02004428 writer->overallocate = 1;
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004429 if (_PyUnicodeWriter_Prepare(writer, writer->min_length,
4430 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4431 goto onError;
4432 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004433 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004434 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004435
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004436 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004437 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004438
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004439 /* we made it! */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004440 Py_XDECREF(restuple);
4441 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004442
Benjamin Peterson29060642009-01-31 22:14:21 +00004443 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004444 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004445 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004446}
4447
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004448/* --- UTF-7 Codec -------------------------------------------------------- */
4449
Antoine Pitrou244651a2009-05-04 18:56:13 +00004450/* See RFC2152 for details. We encode conservatively and decode liberally. */
4451
4452/* Three simple macros defining base-64. */
4453
/* Is c a base-64 character, i.e. one of A-Z, a-z, 0-9, '+' or '/'?
   Note that c is evaluated more than once. */

#define IS_BASE64(c) \
    ((c) == '+' || (c) == '/' || \
     ('0' <= (c) && (c) <= '9') || \
     ('A' <= (c) && (c) <= 'Z') || \
     ('a' <= (c) && (c) <= 'z'))
4461
/* Given that c is a base-64 character, what is its base-64 value?
   'A'-'Z' give 0-25, 'a'-'z' give 26-51, '0'-'9' give 52-61 and '+'
   gives 62; every other value (i.e. '/') gives 63. */

#define FROM_BASE64(c) \
    ((c) == '+' ? 62 : \
     ('0' <= (c) && (c) <= '9') ? (c) - '0' + 52 : \
     ('a' <= (c) && (c) <= 'z') ? (c) - 'a' + 26 : \
     ('A' <= (c) && (c) <= 'Z') ? (c) - 'A' : 63)
4469
/* What is the base-64 character of the bottom 6 bits of n?
   Higher bits of n are masked off. */

#define TO_BASE64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ" \
     "abcdefghijklmnopqrstuvwxyz" \
     "0123456789+/"[(n) & 0x3f])
4474
/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  Decoding is permissive: every ASCII byte decodes
 * to itself except '+', which begins a base64 string. */

#define DECODE_DIRECT(c) \
    ((c) != '+' && (c) < 128)
4482
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code (0-127) and is only ever read,
 * hence it is declared const.
 */

static const
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
4516
/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c must be in 1..127 to qualify; NUL and non-ASCII values are never
 * direct.  directO / directWS are truth values enabling direct encoding
 * of category 1 (Set O) and category 2 (whitespace) characters.  Every
 * macro parameter is parenthesized in the expansion so that compound
 * argument expressions keep their intended meaning.  c is evaluated
 * more than once. */

#define ENCODE_DIRECT(c, directO, directWS) \
    ((c) < 128 && (c) > 0 && \
     ((utf7_category[(c)] == 0) || \
      ((directWS) && (utf7_category[(c)] == 2)) || \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004528
Alexander Belopolsky40018472011-02-26 01:02:56 +00004529PyObject *
4530PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004531 Py_ssize_t size,
4532 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004533{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004534 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4535}
4536
Antoine Pitrou244651a2009-05-04 18:56:13 +00004537/* The decoder. The only state we preserve is our read position,
4538 * i.e. how many characters we have consumed. So if we end in the
4539 * middle of a shift sequence we have to back off the read position
4540 * and the output to the beginning of the sequence, otherwise we lose
4541 * all the shift state (seen bits, number of bits seen, high
4542 * surrogate). */
4543
Alexander Belopolsky40018472011-02-26 01:02:56 +00004544PyObject *
4545PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004546 Py_ssize_t size,
4547 const char *errors,
4548 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004549{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004550 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004551 Py_ssize_t startinpos;
4552 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004553 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004554 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004555 const char *errmsg = "";
4556 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004557 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004558 unsigned int base64bits = 0;
4559 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004560 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004561 PyObject *errorHandler = NULL;
4562 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004563
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004564 if (size == 0) {
4565 if (consumed)
4566 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004567 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004568 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004569
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004570 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004571 _PyUnicodeWriter_Init(&writer);
4572 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004573
4574 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004575 e = s + size;
4576
4577 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004578 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004579 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004580 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004581
Antoine Pitrou244651a2009-05-04 18:56:13 +00004582 if (inShift) { /* in a base-64 section */
4583 if (IS_BASE64(ch)) { /* consume a base-64 character */
4584 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4585 base64bits += 6;
4586 s++;
4587 if (base64bits >= 16) {
4588 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004589 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004590 base64bits -= 16;
4591 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004592 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004593 if (surrogate) {
4594 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004595 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4596 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004597 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004598 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004599 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004600 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004601 }
4602 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004603 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004604 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004605 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004606 }
4607 }
Victor Stinner551ac952011-11-29 22:58:13 +01004608 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004609 /* first surrogate */
4610 surrogate = outCh;
4611 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004612 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004613 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004614 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004615 }
4616 }
4617 }
4618 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004619 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004620 if (base64bits > 0) { /* left-over bits */
4621 if (base64bits >= 6) {
4622 /* We've seen at least one base-64 character */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004623 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004624 errmsg = "partial character in shift sequence";
4625 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004626 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004627 else {
4628 /* Some bits remain; they should be zero */
4629 if (base64buffer != 0) {
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004630 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004631 errmsg = "non-zero padding bits in shift sequence";
4632 goto utf7Error;
4633 }
4634 }
4635 }
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004636 if (surrogate && DECODE_DIRECT(ch)) {
4637 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4638 goto onError;
4639 }
4640 surrogate = 0;
4641 if (ch == '-') {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004642 /* '-' is absorbed; other terminating
4643 characters are preserved */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004644 s++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004645 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004646 }
4647 }
4648 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004649 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004650 s++; /* consume '+' */
4651 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004652 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004653 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004654 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004655 }
4656 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004657 inShift = 1;
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004658 surrogate = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004659 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004660 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004661 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004662 }
4663 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004664 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004665 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004666 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004667 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004668 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004669 else {
4670 startinpos = s-starts;
4671 s++;
4672 errmsg = "unexpected special character";
4673 goto utf7Error;
4674 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004675 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004676utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004677 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004678 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004679 errors, &errorHandler,
4680 "utf7", errmsg,
4681 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004682 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004683 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004684 }
4685
Antoine Pitrou244651a2009-05-04 18:56:13 +00004686 /* end of string */
4687
4688 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4689 /* if we're in an inconsistent state, that's an error */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004690 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004691 if (surrogate ||
4692 (base64bits >= 6) ||
4693 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004694 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004695 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004696 errors, &errorHandler,
4697 "utf7", "unterminated shift sequence",
4698 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004699 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004700 goto onError;
4701 if (s < e)
4702 goto restart;
4703 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004704 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004705
4706 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004707 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004708 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004709 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004710 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004711 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004712 writer.kind, writer.data, shiftOutStart);
4713 Py_XDECREF(errorHandler);
4714 Py_XDECREF(exc);
4715 _PyUnicodeWriter_Dealloc(&writer);
4716 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004717 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004718 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004719 }
4720 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004721 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004722 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004723 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004724
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004725 Py_XDECREF(errorHandler);
4726 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004727 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004728
Benjamin Peterson29060642009-01-31 22:14:21 +00004729 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004730 Py_XDECREF(errorHandler);
4731 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004732 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004733 return NULL;
4734}
4735
4736
Alexander Belopolsky40018472011-02-26 01:02:56 +00004737PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004738_PyUnicode_EncodeUTF7(PyObject *str,
4739 int base64SetO,
4740 int base64WhiteSpace,
4741 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004742{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004743 int kind;
4744 void *data;
4745 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004746 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004747 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004748 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004749 unsigned int base64bits = 0;
4750 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004751 char * out;
4752 char * start;
4753
Benjamin Petersonbac79492012-01-14 13:34:47 -05004754 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004755 return NULL;
4756 kind = PyUnicode_KIND(str);
4757 data = PyUnicode_DATA(str);
4758 len = PyUnicode_GET_LENGTH(str);
4759
4760 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004761 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004762
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004763 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004764 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004765 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004766 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004767 if (v == NULL)
4768 return NULL;
4769
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004770 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004771 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004772 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004773
Antoine Pitrou244651a2009-05-04 18:56:13 +00004774 if (inShift) {
4775 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4776 /* shifting out */
4777 if (base64bits) { /* output remaining bits */
4778 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4779 base64buffer = 0;
4780 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004781 }
4782 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004783 /* Characters not in the BASE64 set implicitly unshift the sequence
4784 so no '-' is required, except if the character is itself a '-' */
4785 if (IS_BASE64(ch) || ch == '-') {
4786 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004787 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004788 *out++ = (char) ch;
4789 }
4790 else {
4791 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004792 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004793 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004794 else { /* not in a shift sequence */
4795 if (ch == '+') {
4796 *out++ = '+';
4797 *out++ = '-';
4798 }
4799 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4800 *out++ = (char) ch;
4801 }
4802 else {
4803 *out++ = '+';
4804 inShift = 1;
4805 goto encode_char;
4806 }
4807 }
4808 continue;
4809encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004810 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004811 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004812
Antoine Pitrou244651a2009-05-04 18:56:13 +00004813 /* code first surrogate */
4814 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004815 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004816 while (base64bits >= 6) {
4817 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4818 base64bits -= 6;
4819 }
4820 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004821 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004822 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004823 base64bits += 16;
4824 base64buffer = (base64buffer << 16) | ch;
4825 while (base64bits >= 6) {
4826 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4827 base64bits -= 6;
4828 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004829 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004830 if (base64bits)
4831 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4832 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004833 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004834 if (_PyBytes_Resize(&v, out - start) < 0)
4835 return NULL;
4836 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004837}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004838PyObject *
4839PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4840 Py_ssize_t size,
4841 int base64SetO,
4842 int base64WhiteSpace,
4843 const char *errors)
4844{
4845 PyObject *result;
4846 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4847 if (tmp == NULL)
4848 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004849 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004850 base64WhiteSpace, errors);
4851 Py_DECREF(tmp);
4852 return result;
4853}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004854
Antoine Pitrou244651a2009-05-04 18:56:13 +00004855#undef IS_BASE64
4856#undef FROM_BASE64
4857#undef TO_BASE64
4858#undef DECODE_DIRECT
4859#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004860
Guido van Rossumd57fd912000-03-10 22:53:23 +00004861/* --- UTF-8 Codec -------------------------------------------------------- */
4862
Alexander Belopolsky40018472011-02-26 01:02:56 +00004863PyObject *
4864PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004865 Py_ssize_t size,
4866 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004867{
Walter Dörwald69652032004-09-07 20:24:22 +00004868 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4869}
4870
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004871#include "stringlib/asciilib.h"
4872#include "stringlib/codecs.h"
4873#include "stringlib/undef.h"
4874
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004875#include "stringlib/ucs1lib.h"
4876#include "stringlib/codecs.h"
4877#include "stringlib/undef.h"
4878
4879#include "stringlib/ucs2lib.h"
4880#include "stringlib/codecs.h"
4881#include "stringlib/undef.h"
4882
4883#include "stringlib/ucs4lib.h"
4884#include "stringlib/codecs.h"
4885#include "stringlib/undef.h"
4886
Antoine Pitrouab868312009-01-10 15:40:25 +00004887/* Mask to quickly check whether a C 'long' contains a
4888 non-ASCII, UTF8-encoded char. */
4889#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004890# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004891#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004892# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004893#else
4894# error C 'long' size should be either 4 or 8!
4895#endif
4896
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004897static Py_ssize_t
4898ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004899{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004900 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004901 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004902
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004903 /*
4904 * Issue #17237: m68k is a bit different from most architectures in
4905 * that objects do not use "natural alignment" - for example, int and
4906 * long are only aligned at 2-byte boundaries. Therefore the assert()
4907 * won't work; also, tests have shown that skipping the "optimised
4908 * version" will even speed up m68k.
4909 */
4910#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004911#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004912 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4913 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004914 /* Fast path, see in STRINGLIB(utf8_decode) for
4915 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004916 /* Help allocation */
4917 const char *_p = p;
4918 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004919 while (_p < aligned_end) {
4920 unsigned long value = *(const unsigned long *) _p;
4921 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004922 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004923 *((unsigned long *)q) = value;
4924 _p += SIZEOF_LONG;
4925 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004926 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004927 p = _p;
4928 while (p < end) {
4929 if ((unsigned char)*p & 0x80)
4930 break;
4931 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004932 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004933 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004934 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004935#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004936#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004937 while (p < end) {
4938 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4939 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004940 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004941 /* Help allocation */
4942 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004943 while (_p < aligned_end) {
4944 unsigned long value = *(unsigned long *) _p;
4945 if (value & ASCII_CHAR_MASK)
4946 break;
4947 _p += SIZEOF_LONG;
4948 }
4949 p = _p;
4950 if (_p == end)
4951 break;
4952 }
4953 if ((unsigned char)*p & 0x80)
4954 break;
4955 ++p;
4956 }
4957 memcpy(dest, start, p - start);
4958 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004959}
Antoine Pitrouab868312009-01-10 15:40:25 +00004960
Victor Stinner785938e2011-12-11 20:09:03 +01004961PyObject *
4962PyUnicode_DecodeUTF8Stateful(const char *s,
4963 Py_ssize_t size,
4964 const char *errors,
4965 Py_ssize_t *consumed)
4966{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004967 _PyUnicodeWriter writer;
Victor Stinner785938e2011-12-11 20:09:03 +01004968 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004969 const char *end = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004970
4971 Py_ssize_t startinpos;
4972 Py_ssize_t endinpos;
4973 const char *errmsg = "";
Victor Stinner1d65d912015-10-05 13:43:50 +02004974 PyObject *error_handler_obj = NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004975 PyObject *exc = NULL;
Victor Stinner1d65d912015-10-05 13:43:50 +02004976 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner785938e2011-12-11 20:09:03 +01004977
4978 if (size == 0) {
4979 if (consumed)
4980 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004981 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004982 }
4983
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004984 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4985 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004986 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004987 *consumed = 1;
4988 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004989 }
4990
Victor Stinner8f674cc2013-04-17 23:02:17 +02004991 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02004992 writer.min_length = size;
4993 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004994 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004995
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004996 writer.pos = ascii_decode(s, end, writer.data);
4997 s += writer.pos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004998 while (s < end) {
4999 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005000 int kind = writer.kind;
Victor Stinner1d65d912015-10-05 13:43:50 +02005001
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005002 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005003 if (PyUnicode_IS_ASCII(writer.buffer))
5004 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005005 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005006 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005007 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005008 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005009 } else {
5010 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005011 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005012 }
5013
5014 switch (ch) {
5015 case 0:
5016 if (s == end || consumed)
5017 goto End;
5018 errmsg = "unexpected end of data";
5019 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005020 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005021 break;
5022 case 1:
5023 errmsg = "invalid start byte";
5024 startinpos = s - starts;
5025 endinpos = startinpos + 1;
5026 break;
5027 case 2:
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005028 case 3:
5029 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005030 errmsg = "invalid continuation byte";
5031 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005032 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005033 break;
5034 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005035 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005036 goto onError;
5037 continue;
5038 }
5039
Victor Stinner1d65d912015-10-05 13:43:50 +02005040 if (error_handler == _Py_ERROR_UNKNOWN)
5041 error_handler = get_error_handler(errors);
5042
5043 switch (error_handler) {
5044 case _Py_ERROR_IGNORE:
5045 s += (endinpos - startinpos);
5046 break;
5047
5048 case _Py_ERROR_REPLACE:
5049 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
5050 goto onError;
5051 s += (endinpos - startinpos);
5052 break;
5053
5054 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner74e8fac2015-10-05 13:49:26 +02005055 {
5056 Py_ssize_t i;
5057
Victor Stinner1d65d912015-10-05 13:43:50 +02005058 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
5059 goto onError;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005060 for (i=startinpos; i<endinpos; i++) {
Victor Stinner1d65d912015-10-05 13:43:50 +02005061 ch = (Py_UCS4)(unsigned char)(starts[i]);
5062 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
5063 ch + 0xdc00);
5064 writer.pos++;
5065 }
5066 s += (endinpos - startinpos);
5067 break;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005068 }
Victor Stinner1d65d912015-10-05 13:43:50 +02005069
5070 default:
5071 if (unicode_decode_call_errorhandler_writer(
5072 errors, &error_handler_obj,
5073 "utf-8", errmsg,
5074 &starts, &end, &startinpos, &endinpos, &exc, &s,
5075 &writer))
5076 goto onError;
5077 }
Victor Stinner785938e2011-12-11 20:09:03 +01005078 }
5079
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005080End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005081 if (consumed)
5082 *consumed = s - starts;
5083
Victor Stinner1d65d912015-10-05 13:43:50 +02005084 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005085 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005086 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005087
5088onError:
Victor Stinner1d65d912015-10-05 13:43:50 +02005089 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005090 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005091 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005092 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01005093}
5094
#ifdef __APPLE__

/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X.

   s    -- UTF-8 encoded input (need not be NUL-terminated)
   size -- number of bytes to decode from s

   Every byte that cannot be decoded is stored as the lone surrogate
   0xDC00 + byte.  The result is always NUL-terminated.

   Return a pointer to a newly allocated wide character string (use
   PyMem_RawFree() to free the memory), or NULL on memory allocation error
   (including a size too large to allocate for). */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    const char *e;
    wchar_t *unicode;
    Py_ssize_t outpos;

    /* Note: size will always be longer than the resulting Unicode
       character count, so size + 1 wide characters (including the
       terminating NUL) are always enough.

       The bound is written as "size > MAX / sizeof - 1" so that size + 1
       is only evaluated once it is known not to overflow. */
    if (size > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1)
        return NULL;
    unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
    if (!unicode)
        return NULL;

    /* Unpack UTF-8 encoded data */
    e = s + size;
    outpos = 0;
    while (s < e) {
        Py_UCS4 ch;
#if SIZEOF_WCHAR_T == 4
        ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
#else
        ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
#endif
        if (ch > 0xFF) {
#if SIZEOF_WCHAR_T == 4
            /* With 4-byte storage the decoder has no reason to hand a
               character back to the caller. */
            assert(0);
#else
            /* The decoder hands back the characters that do not fit into
               16 bits, i.e. non-BMP code points (never surrogates). */
            assert(ch > 0xFFFF && ch <= 0x10FFFF);
            /* compute and append the two surrogates: */
            unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
            unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
#endif
        }
        else {
            /* ch is a status code from the decoder; 0 with all input
               consumed means we are done. */
            if (!ch && s == e)
                break;
            /* surrogateescape: escape one byte, then resume decoding
               right after it */
            unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
        }
    }
    unicode[outpos] = L'\0';
    return unicode;
}

#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00005150
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005151/* Primary internal function which creates utf8 encoded bytes objects.
5152
5153 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005154 and allocate exactly as much space needed at the end. Else allocate the
5155 maximum possible needed (4 result bytes per Unicode character), and return
5156 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005157*/
Tim Peters7e3d9612002-04-21 03:26:37 +00005158PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01005159_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005160{
Victor Stinner6099a032011-12-18 14:22:26 +01005161 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005162 void *data;
5163 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00005164
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005165 if (!PyUnicode_Check(unicode)) {
5166 PyErr_BadArgument();
5167 return NULL;
5168 }
5169
5170 if (PyUnicode_READY(unicode) == -1)
5171 return NULL;
5172
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005173 if (PyUnicode_UTF8(unicode))
5174 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5175 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005176
5177 kind = PyUnicode_KIND(unicode);
5178 data = PyUnicode_DATA(unicode);
5179 size = PyUnicode_GET_LENGTH(unicode);
5180
Benjamin Petersonead6b532011-12-20 17:23:42 -06005181 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005182 default:
5183 assert(0);
5184 case PyUnicode_1BYTE_KIND:
5185 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5186 assert(!PyUnicode_IS_ASCII(unicode));
5187 return ucs1lib_utf8_encoder(unicode, data, size, errors);
5188 case PyUnicode_2BYTE_KIND:
5189 return ucs2lib_utf8_encoder(unicode, data, size, errors);
5190 case PyUnicode_4BYTE_KIND:
5191 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00005192 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005193}
5194
Alexander Belopolsky40018472011-02-26 01:02:56 +00005195PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005196PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5197 Py_ssize_t size,
5198 const char *errors)
5199{
5200 PyObject *v, *unicode;
5201
5202 unicode = PyUnicode_FromUnicode(s, size);
5203 if (unicode == NULL)
5204 return NULL;
5205 v = _PyUnicode_AsUTF8String(unicode, errors);
5206 Py_DECREF(unicode);
5207 return v;
5208}
5209
5210PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005211PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005212{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005213 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005214}
5215
Walter Dörwald41980ca2007-08-16 21:55:45 +00005216/* --- UTF-32 Codec ------------------------------------------------------- */
5217
5218PyObject *
5219PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005220 Py_ssize_t size,
5221 const char *errors,
5222 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005223{
5224 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5225}
5226
5227PyObject *
5228PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005229 Py_ssize_t size,
5230 const char *errors,
5231 int *byteorder,
5232 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005233{
5234 const char *starts = s;
5235 Py_ssize_t startinpos;
5236 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005237 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005238 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01005239 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005240 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005241 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005242 PyObject *errorHandler = NULL;
5243 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005244
Walter Dörwald41980ca2007-08-16 21:55:45 +00005245 q = (unsigned char *)s;
5246 e = q + size;
5247
5248 if (byteorder)
5249 bo = *byteorder;
5250
5251 /* Check for BOM marks (U+FEFF) in the input and adjust current
5252 byte order setting accordingly. In native mode, the leading BOM
5253 mark is skipped, in all other modes, it is copied to the output
5254 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01005255 if (bo == 0 && size >= 4) {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005256 Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005257 if (bom == 0x0000FEFF) {
5258 bo = -1;
5259 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005260 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005261 else if (bom == 0xFFFE0000) {
5262 bo = 1;
5263 q += 4;
5264 }
5265 if (byteorder)
5266 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005267 }
5268
Victor Stinnere64322e2012-10-30 23:12:47 +01005269 if (q == e) {
5270 if (consumed)
5271 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005272 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00005273 }
5274
Victor Stinnere64322e2012-10-30 23:12:47 +01005275#ifdef WORDS_BIGENDIAN
5276 le = bo < 0;
5277#else
5278 le = bo <= 0;
5279#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005280 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01005281
Victor Stinner8f674cc2013-04-17 23:02:17 +02005282 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005283 writer.min_length = (e - q + 3) / 4;
5284 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005285 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005286
Victor Stinnere64322e2012-10-30 23:12:47 +01005287 while (1) {
5288 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005289 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005290
Victor Stinnere64322e2012-10-30 23:12:47 +01005291 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005292 enum PyUnicode_Kind kind = writer.kind;
5293 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005294 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005295 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005296 if (le) {
5297 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005298 ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005299 if (ch > maxch)
5300 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005301 if (kind != PyUnicode_1BYTE_KIND &&
5302 Py_UNICODE_IS_SURROGATE(ch))
5303 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005304 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005305 q += 4;
5306 } while (q <= last);
5307 }
5308 else {
5309 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005310 ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
Victor Stinnere64322e2012-10-30 23:12:47 +01005311 if (ch > maxch)
5312 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005313 if (kind != PyUnicode_1BYTE_KIND &&
5314 Py_UNICODE_IS_SURROGATE(ch))
5315 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005316 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005317 q += 4;
5318 } while (q <= last);
5319 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005320 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005321 }
5322
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005323 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005324 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005325 startinpos = ((const char *)q) - starts;
5326 endinpos = startinpos + 4;
5327 }
5328 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005329 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005330 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005331 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005332 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005333 startinpos = ((const char *)q) - starts;
5334 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005335 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005336 else {
5337 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005338 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005339 goto onError;
5340 q += 4;
5341 continue;
5342 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005343 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005344 startinpos = ((const char *)q) - starts;
5345 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005346 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005347
5348 /* The remaining input chars are ignored if the callback
5349 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005350 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005351 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005352 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005353 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005354 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005355 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005356 }
5357
Walter Dörwald41980ca2007-08-16 21:55:45 +00005358 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005359 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005360
Walter Dörwald41980ca2007-08-16 21:55:45 +00005361 Py_XDECREF(errorHandler);
5362 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005363 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005364
Benjamin Peterson29060642009-01-31 22:14:21 +00005365 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005366 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005367 Py_XDECREF(errorHandler);
5368 Py_XDECREF(exc);
5369 return NULL;
5370}
5371
5372PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005373_PyUnicode_EncodeUTF32(PyObject *str,
5374 const char *errors,
5375 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005376{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005377 enum PyUnicode_Kind kind;
5378 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005379 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005380 PyObject *v;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005381 uint32_t *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005382#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005383 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005384#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005385 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005386#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005387 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005388 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005389 PyObject *errorHandler = NULL;
5390 PyObject *exc = NULL;
5391 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005392
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005393 if (!PyUnicode_Check(str)) {
5394 PyErr_BadArgument();
5395 return NULL;
5396 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005397 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005398 return NULL;
5399 kind = PyUnicode_KIND(str);
5400 data = PyUnicode_DATA(str);
5401 len = PyUnicode_GET_LENGTH(str);
5402
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005403 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005404 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005405 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005406 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005407 if (v == NULL)
5408 return NULL;
5409
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005410 /* output buffer is 4-bytes aligned */
5411 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005412 out = (uint32_t *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005413 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005414 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005415 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005416 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005417
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005418 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005419 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005420 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005421 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005422 else
5423 encoding = "utf-32";
5424
5425 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005426 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5427 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005428 }
5429
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005430 pos = 0;
5431 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005432 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005433
5434 if (kind == PyUnicode_2BYTE_KIND) {
5435 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5436 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005437 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005438 else {
5439 assert(kind == PyUnicode_4BYTE_KIND);
5440 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5441 &out, native_ordering);
5442 }
5443 if (pos == len)
5444 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005445
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005446 rep = unicode_encode_call_errorhandler(
5447 errors, &errorHandler,
5448 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005449 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005450 if (!rep)
5451 goto error;
5452
5453 if (PyBytes_Check(rep)) {
5454 repsize = PyBytes_GET_SIZE(rep);
5455 if (repsize & 3) {
5456 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005457 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005458 "surrogates not allowed");
5459 goto error;
5460 }
5461 moreunits = repsize / 4;
5462 }
5463 else {
5464 assert(PyUnicode_Check(rep));
5465 if (PyUnicode_READY(rep) < 0)
5466 goto error;
5467 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5468 if (!PyUnicode_IS_ASCII(rep)) {
5469 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005470 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005471 "surrogates not allowed");
5472 goto error;
5473 }
5474 }
5475
5476 /* four bytes are reserved for each surrogate */
5477 if (moreunits > 1) {
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005478 Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005479 Py_ssize_t morebytes = 4 * (moreunits - 1);
5480 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5481 /* integer overflow */
5482 PyErr_NoMemory();
5483 goto error;
5484 }
5485 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5486 goto error;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005487 out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005488 }
5489
5490 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005491 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005492 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005493 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005494 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005495 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5496 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005497 }
5498
5499 Py_CLEAR(rep);
5500 }
5501
5502 /* Cut back to size actually needed. This is necessary for, for example,
5503 encoding of a string containing isolated surrogates and the 'ignore'
5504 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005505 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005506 if (nsize != PyBytes_GET_SIZE(v))
5507 _PyBytes_Resize(&v, nsize);
5508 Py_XDECREF(errorHandler);
5509 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005510 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005511 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005512 error:
5513 Py_XDECREF(rep);
5514 Py_XDECREF(errorHandler);
5515 Py_XDECREF(exc);
5516 Py_XDECREF(v);
5517 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005518}
5519
Alexander Belopolsky40018472011-02-26 01:02:56 +00005520PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005521PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5522 Py_ssize_t size,
5523 const char *errors,
5524 int byteorder)
5525{
5526 PyObject *result;
5527 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5528 if (tmp == NULL)
5529 return NULL;
5530 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5531 Py_DECREF(tmp);
5532 return result;
5533}
5534
5535PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005536PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005537{
Victor Stinnerb960b342011-11-20 19:12:52 +01005538 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005539}
5540
Guido van Rossumd57fd912000-03-10 22:53:23 +00005541/* --- UTF-16 Codec ------------------------------------------------------- */
5542
Tim Peters772747b2001-08-09 22:21:55 +00005543PyObject *
5544PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005545 Py_ssize_t size,
5546 const char *errors,
5547 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005548{
Walter Dörwald69652032004-09-07 20:24:22 +00005549 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5550}
5551
5552PyObject *
5553PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005554 Py_ssize_t size,
5555 const char *errors,
5556 int *byteorder,
5557 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005558{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005559 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005560 Py_ssize_t startinpos;
5561 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005562 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005563 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005564 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005565 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005566 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005567 PyObject *errorHandler = NULL;
5568 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005569 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005570
Tim Peters772747b2001-08-09 22:21:55 +00005571 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005572 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005573
5574 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005575 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005576
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005577 /* Check for BOM marks (U+FEFF) in the input and adjust current
5578 byte order setting accordingly. In native mode, the leading BOM
5579 mark is skipped, in all other modes, it is copied to the output
5580 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005581 if (bo == 0 && size >= 2) {
5582 const Py_UCS4 bom = (q[1] << 8) | q[0];
5583 if (bom == 0xFEFF) {
5584 q += 2;
5585 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005586 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005587 else if (bom == 0xFFFE) {
5588 q += 2;
5589 bo = 1;
5590 }
5591 if (byteorder)
5592 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005593 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005594
Antoine Pitrou63065d72012-05-15 23:48:04 +02005595 if (q == e) {
5596 if (consumed)
5597 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005598 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005599 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005600
Christian Heimes743e0cd2012-10-17 23:52:17 +02005601#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005602 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005603 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005604#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005605 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005606 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005607#endif
Tim Peters772747b2001-08-09 22:21:55 +00005608
Antoine Pitrou63065d72012-05-15 23:48:04 +02005609 /* Note: size will always be longer than the resulting Unicode
5610 character count */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005611 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005612 writer.min_length = (e - q + 1) / 2;
5613 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005614 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005615
Antoine Pitrou63065d72012-05-15 23:48:04 +02005616 while (1) {
5617 Py_UCS4 ch = 0;
5618 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005619 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005620 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005621 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005622 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005623 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005624 native_ordering);
5625 else
5626 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005627 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005628 native_ordering);
5629 } else if (kind == PyUnicode_2BYTE_KIND) {
5630 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005631 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005632 native_ordering);
5633 } else {
5634 assert(kind == PyUnicode_4BYTE_KIND);
5635 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005636 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005637 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005638 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005639 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005640
Antoine Pitrou63065d72012-05-15 23:48:04 +02005641 switch (ch)
5642 {
5643 case 0:
5644 /* remaining byte at the end? (size should be even) */
5645 if (q == e || consumed)
5646 goto End;
5647 errmsg = "truncated data";
5648 startinpos = ((const char *)q) - starts;
5649 endinpos = ((const char *)e) - starts;
5650 break;
5651 /* The remaining input chars are ignored if the callback
5652 chooses to skip the input */
5653 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005654 q -= 2;
5655 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005656 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005657 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005658 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005659 endinpos = ((const char *)e) - starts;
5660 break;
5661 case 2:
5662 errmsg = "illegal encoding";
5663 startinpos = ((const char *)q) - 2 - starts;
5664 endinpos = startinpos + 2;
5665 break;
5666 case 3:
5667 errmsg = "illegal UTF-16 surrogate";
5668 startinpos = ((const char *)q) - 4 - starts;
5669 endinpos = startinpos + 2;
5670 break;
5671 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005672 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005673 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005674 continue;
5675 }
5676
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005677 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005678 errors,
5679 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005680 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00005681 &starts,
5682 (const char **)&e,
5683 &startinpos,
5684 &endinpos,
5685 &exc,
5686 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005687 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005688 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005689 }
5690
Antoine Pitrou63065d72012-05-15 23:48:04 +02005691End:
Walter Dörwald69652032004-09-07 20:24:22 +00005692 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005693 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005694
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005695 Py_XDECREF(errorHandler);
5696 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005697 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005698
Benjamin Peterson29060642009-01-31 22:14:21 +00005699 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005700 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005701 Py_XDECREF(errorHandler);
5702 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005703 return NULL;
5704}
5705
Tim Peters772747b2001-08-09 22:21:55 +00005706PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005707_PyUnicode_EncodeUTF16(PyObject *str,
5708 const char *errors,
5709 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005710{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005711 enum PyUnicode_Kind kind;
5712 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005713 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005714 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005715 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005716 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005717#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005718 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005719#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005720 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005721#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005722 const char *encoding;
5723 Py_ssize_t nsize, pos;
5724 PyObject *errorHandler = NULL;
5725 PyObject *exc = NULL;
5726 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00005727
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005728 if (!PyUnicode_Check(str)) {
5729 PyErr_BadArgument();
5730 return NULL;
5731 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005732 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005733 return NULL;
5734 kind = PyUnicode_KIND(str);
5735 data = PyUnicode_DATA(str);
5736 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005737
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005738 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005739 if (kind == PyUnicode_4BYTE_KIND) {
5740 const Py_UCS4 *in = (const Py_UCS4 *)data;
5741 const Py_UCS4 *end = in + len;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005742 while (in < end) {
5743 if (*in++ >= 0x10000) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005744 pairs++;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005745 }
5746 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005747 }
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005748 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005749 return PyErr_NoMemory();
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005750 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005751 nsize = len + pairs + (byteorder == 0);
5752 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005753 if (v == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005754 return NULL;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005755 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005756
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005757 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005758 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005759 out = (unsigned short *)PyBytes_AS_STRING(v);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005760 if (byteorder == 0) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005761 *out++ = 0xFEFF;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005762 }
5763 if (len == 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00005764 goto done;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005765 }
Tim Peters772747b2001-08-09 22:21:55 +00005766
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005767 if (kind == PyUnicode_1BYTE_KIND) {
5768 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5769 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005770 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005771
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005772 if (byteorder < 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005773 encoding = "utf-16-le";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005774 }
5775 else if (byteorder > 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005776 encoding = "utf-16-be";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005777 }
5778 else {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005779 encoding = "utf-16";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005780 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005781
5782 pos = 0;
5783 while (pos < len) {
5784 Py_ssize_t repsize, moreunits;
5785
5786 if (kind == PyUnicode_2BYTE_KIND) {
5787 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
5788 &out, native_ordering);
5789 }
5790 else {
5791 assert(kind == PyUnicode_4BYTE_KIND);
5792 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
5793 &out, native_ordering);
5794 }
5795 if (pos == len)
5796 break;
5797
5798 rep = unicode_encode_call_errorhandler(
5799 errors, &errorHandler,
5800 encoding, "surrogates not allowed",
5801 str, &exc, pos, pos + 1, &pos);
5802 if (!rep)
5803 goto error;
5804
5805 if (PyBytes_Check(rep)) {
5806 repsize = PyBytes_GET_SIZE(rep);
5807 if (repsize & 1) {
5808 raise_encode_exception(&exc, encoding,
5809 str, pos - 1, pos,
5810 "surrogates not allowed");
5811 goto error;
5812 }
5813 moreunits = repsize / 2;
5814 }
5815 else {
5816 assert(PyUnicode_Check(rep));
5817 if (PyUnicode_READY(rep) < 0)
5818 goto error;
5819 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5820 if (!PyUnicode_IS_ASCII(rep)) {
5821 raise_encode_exception(&exc, encoding,
5822 str, pos - 1, pos,
5823 "surrogates not allowed");
5824 goto error;
5825 }
5826 }
5827
5828 /* two bytes are reserved for each surrogate */
5829 if (moreunits > 1) {
5830 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
5831 Py_ssize_t morebytes = 2 * (moreunits - 1);
5832 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5833 /* integer overflow */
5834 PyErr_NoMemory();
5835 goto error;
5836 }
5837 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5838 goto error;
5839 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
5840 }
5841
5842 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005843 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005844 out += moreunits;
5845 } else /* rep is unicode */ {
5846 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5847 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5848 &out, native_ordering);
5849 }
5850
5851 Py_CLEAR(rep);
5852 }
5853
5854 /* Cut back to size actually needed. This is necessary for, for example,
5855 encoding of a string containing isolated surrogates and the 'ignore' handler
5856 is used. */
5857 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5858 if (nsize != PyBytes_GET_SIZE(v))
5859 _PyBytes_Resize(&v, nsize);
5860 Py_XDECREF(errorHandler);
5861 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00005862 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005863 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005864 error:
5865 Py_XDECREF(rep);
5866 Py_XDECREF(errorHandler);
5867 Py_XDECREF(exc);
5868 Py_XDECREF(v);
5869 return NULL;
5870#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005871}
5872
Alexander Belopolsky40018472011-02-26 01:02:56 +00005873PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005874PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5875 Py_ssize_t size,
5876 const char *errors,
5877 int byteorder)
5878{
5879 PyObject *result;
5880 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5881 if (tmp == NULL)
5882 return NULL;
5883 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5884 Py_DECREF(tmp);
5885 return result;
5886}
5887
5888PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005889PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005890{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005891 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005892}
5893
5894/* --- Unicode Escape Codec ----------------------------------------------- */
5895
Fredrik Lundh06d12682001-01-24 07:59:11 +00005896static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005897
Alexander Belopolsky40018472011-02-26 01:02:56 +00005898PyObject *
5899PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005900 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005901 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005902{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005903 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005904 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005905 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005906 PyObject *errorHandler = NULL;
5907 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005908
Victor Stinner62ec3312016-09-06 17:04:34 -07005909 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005910 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07005911 }
5912 /* Escaped strings will always be longer than the resulting
5913 Unicode string, so we start with size here and then reduce the
5914 length after conversion to the true value.
5915 (but if the error callback returns a long replacement string
5916 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005917 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07005918 writer.min_length = size;
5919 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
5920 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005921 }
5922
Guido van Rossumd57fd912000-03-10 22:53:23 +00005923 end = s + size;
5924 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07005925 unsigned char c = (unsigned char) *s++;
5926 Py_UCS4 ch;
5927 int count;
5928 Py_ssize_t startinpos;
5929 Py_ssize_t endinpos;
5930 const char *message;
5931
5932#define WRITE_ASCII_CHAR(ch) \
5933 do { \
5934 assert(ch <= 127); \
5935 assert(writer.pos < writer.size); \
5936 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
5937 } while(0)
5938
5939#define WRITE_CHAR(ch) \
5940 do { \
5941 if (ch <= writer.maxchar) { \
5942 assert(writer.pos < writer.size); \
5943 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
5944 } \
5945 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
5946 goto onError; \
5947 } \
5948 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005949
5950 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07005951 if (c != '\\') {
5952 WRITE_CHAR(c);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005953 continue;
5954 }
5955
Victor Stinner62ec3312016-09-06 17:04:34 -07005956 startinpos = s - starts - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005957 /* \ - Escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07005958 if (s >= end) {
5959 message = "\\ at end of string";
5960 goto error;
5961 }
5962 c = (unsigned char) *s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005963
Victor Stinner62ec3312016-09-06 17:04:34 -07005964 assert(writer.pos < writer.size);
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005965 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005966
Benjamin Peterson29060642009-01-31 22:14:21 +00005967 /* \x escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07005968 case '\n': continue;
5969 case '\\': WRITE_ASCII_CHAR('\\'); continue;
5970 case '\'': WRITE_ASCII_CHAR('\''); continue;
5971 case '\"': WRITE_ASCII_CHAR('\"'); continue;
5972 case 'b': WRITE_ASCII_CHAR('\b'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005973 /* FF */
Victor Stinner62ec3312016-09-06 17:04:34 -07005974 case 'f': WRITE_ASCII_CHAR('\014'); continue;
5975 case 't': WRITE_ASCII_CHAR('\t'); continue;
5976 case 'n': WRITE_ASCII_CHAR('\n'); continue;
5977 case 'r': WRITE_ASCII_CHAR('\r'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005978 /* VT */
Victor Stinner62ec3312016-09-06 17:04:34 -07005979 case 'v': WRITE_ASCII_CHAR('\013'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005980 /* BEL, not classic C */
Victor Stinner62ec3312016-09-06 17:04:34 -07005981 case 'a': WRITE_ASCII_CHAR('\007'); continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005982
Benjamin Peterson29060642009-01-31 22:14:21 +00005983 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005984 case '0': case '1': case '2': case '3':
5985 case '4': case '5': case '6': case '7':
Victor Stinner62ec3312016-09-06 17:04:34 -07005986 ch = c - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005987 if (s < end && '0' <= *s && *s <= '7') {
Victor Stinner62ec3312016-09-06 17:04:34 -07005988 ch = (ch<<3) + *s++ - '0';
5989 if (s < end && '0' <= *s && *s <= '7') {
5990 ch = (ch<<3) + *s++ - '0';
5991 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005992 }
Victor Stinner62ec3312016-09-06 17:04:34 -07005993 WRITE_CHAR(ch);
5994 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005995
Benjamin Peterson29060642009-01-31 22:14:21 +00005996 /* hex escapes */
5997 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005998 case 'x':
Victor Stinner62ec3312016-09-06 17:04:34 -07005999 count = 2;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006000 message = "truncated \\xXX escape";
6001 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006002
Benjamin Peterson29060642009-01-31 22:14:21 +00006003 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006004 case 'u':
Victor Stinner62ec3312016-09-06 17:04:34 -07006005 count = 4;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006006 message = "truncated \\uXXXX escape";
6007 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006008
Benjamin Peterson29060642009-01-31 22:14:21 +00006009 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00006010 case 'U':
Victor Stinner62ec3312016-09-06 17:04:34 -07006011 count = 8;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006012 message = "truncated \\UXXXXXXXX escape";
6013 hexescape:
Victor Stinner62ec3312016-09-06 17:04:34 -07006014 for (ch = 0; count && s < end; ++s, --count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006015 c = (unsigned char)*s;
Victor Stinner62ec3312016-09-06 17:04:34 -07006016 ch <<= 4;
6017 if (c >= '0' && c <= '9') {
6018 ch += c - '0';
6019 }
6020 else if (c >= 'a' && c <= 'f') {
6021 ch += c - ('a' - 10);
6022 }
6023 else if (c >= 'A' && c <= 'F') {
6024 ch += c - ('A' - 10);
6025 }
6026 else {
6027 break;
6028 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00006029 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006030 if (count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006031 goto error;
Victor Stinner62ec3312016-09-06 17:04:34 -07006032 }
6033
6034 /* when we get here, ch is a 32-bit unicode character */
6035 if (ch > MAX_UNICODE) {
6036 message = "illegal Unicode character";
6037 goto error;
6038 }
6039
6040 WRITE_CHAR(ch);
6041 continue;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006042
Benjamin Peterson29060642009-01-31 22:14:21 +00006043 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006044 case 'N':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006045 if (ucnhash_CAPI == NULL) {
6046 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006047 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6048 PyUnicodeData_CAPSULE_NAME, 1);
Victor Stinner62ec3312016-09-06 17:04:34 -07006049 if (ucnhash_CAPI == NULL) {
6050 PyErr_SetString(
6051 PyExc_UnicodeError,
6052 "\\N escapes not supported (can't load unicodedata module)"
6053 );
6054 goto onError;
6055 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006056 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006057
6058 message = "malformed \\N character escape";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006059 if (*s == '{') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006060 const char *start = ++s;
6061 size_t namelen;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006062 /* look for the closing brace */
Victor Stinner62ec3312016-09-06 17:04:34 -07006063 while (s < end && *s != '}')
Fredrik Lundhccc74732001-02-18 22:13:49 +00006064 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006065 namelen = s - start;
6066 if (namelen && s < end) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00006067 /* found a name. look it up in the unicode database */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006068 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006069 ch = 0xffffffff; /* in case 'getcode' messes up */
6070 if (namelen <= INT_MAX &&
6071 ucnhash_CAPI->getcode(NULL, start, (int)namelen,
6072 &ch, 0)) {
6073 assert(ch <= MAX_UNICODE);
6074 WRITE_CHAR(ch);
6075 continue;
6076 }
6077 message = "unknown Unicode character name";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006078 }
6079 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006080 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006081
6082 default:
R David Murray110b6fe2016-09-08 15:34:08 -04006083 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6084 "invalid escape sequence '\\%c'", c) < 0)
6085 goto onError;
Victor Stinner62ec3312016-09-06 17:04:34 -07006086 WRITE_ASCII_CHAR('\\');
6087 WRITE_CHAR(c);
6088 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006089 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006090
6091 error:
6092 endinpos = s-starts;
Victor Stinner62ec3312016-09-06 17:04:34 -07006093 writer.min_length = end - s + writer.pos;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006094 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02006095 errors, &errorHandler,
6096 "unicodeescape", message,
6097 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner62ec3312016-09-06 17:04:34 -07006098 &writer)) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006099 goto onError;
Victor Stinner62ec3312016-09-06 17:04:34 -07006100 }
6101 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0) {
6102 goto onError;
6103 }
6104
6105#undef WRITE_ASCII_CHAR
6106#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006107 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006108
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006109 Py_XDECREF(errorHandler);
6110 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006111 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00006112
Benjamin Peterson29060642009-01-31 22:14:21 +00006113 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006114 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006115 Py_XDECREF(errorHandler);
6116 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006117 return NULL;
6118}
6119
6120/* Return a Unicode-Escape string version of the Unicode object.
6121
6122 If quotes is true, the string is enclosed in u"" or u'' quotes as
6123 appropriate.
6124
6125*/
6126
Alexander Belopolsky40018472011-02-26 01:02:56 +00006127PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006128PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006129{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006130 Py_ssize_t i, len;
Victor Stinner62ec3312016-09-06 17:04:34 -07006131 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006132 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006133 enum PyUnicode_Kind kind;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006134 void *data;
Victor Stinner62ec3312016-09-06 17:04:34 -07006135 Py_ssize_t expandsize;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006136
Ezio Melottie7f90372012-10-05 03:33:31 +03006137 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00006138 escape.
6139
Ezio Melottie7f90372012-10-05 03:33:31 +03006140 For UCS1 strings it's '\xxx', 4 bytes per source character.
6141 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6142 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00006143 */
6144
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006145 if (!PyUnicode_Check(unicode)) {
6146 PyErr_BadArgument();
6147 return NULL;
6148 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006149 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006150 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006151 }
Victor Stinner358af132015-10-12 22:36:57 +02006152
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006153 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006154 if (len == 0) {
6155 return PyBytes_FromStringAndSize(NULL, 0);
6156 }
6157
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006158 kind = PyUnicode_KIND(unicode);
6159 data = PyUnicode_DATA(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006160 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6161 bytes, and 1 byte characters 4. */
6162 expandsize = kind * 2 + 2;
6163 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize) {
6164 return PyErr_NoMemory();
6165 }
6166 repr = PyBytes_FromStringAndSize(NULL, 2 + expandsize * len + 1);
6167 if (repr == NULL) {
6168 return NULL;
6169 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006170
Victor Stinner62ec3312016-09-06 17:04:34 -07006171 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006172 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006173 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006174
Victor Stinner62ec3312016-09-06 17:04:34 -07006175 /* U+0000-U+00ff range */
6176 if (ch < 0x100) {
6177 if (ch >= ' ' && ch < 127) {
6178 if (ch != '\\') {
6179 /* Copy printable US ASCII as-is */
6180 *p++ = (char) ch;
6181 }
6182 /* Escape backslashes */
6183 else {
6184 *p++ = '\\';
6185 *p++ = '\\';
6186 }
6187 }
Victor Stinner358af132015-10-12 22:36:57 +02006188
Victor Stinner62ec3312016-09-06 17:04:34 -07006189 /* Map special whitespace to '\t', \n', '\r' */
6190 else if (ch == '\t') {
6191 *p++ = '\\';
6192 *p++ = 't';
6193 }
6194 else if (ch == '\n') {
6195 *p++ = '\\';
6196 *p++ = 'n';
6197 }
6198 else if (ch == '\r') {
6199 *p++ = '\\';
6200 *p++ = 'r';
6201 }
6202
6203 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6204 else {
6205 *p++ = '\\';
6206 *p++ = 'x';
6207 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6208 *p++ = Py_hexdigits[ch & 0x000F];
6209 }
Tim Petersced69f82003-09-16 20:30:58 +00006210 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006211 /* U+0000-U+00ff range: Map 16-bit characters to '\uHHHH' */
6212 else if (ch < 0x10000) {
6213 /* U+0100-U+ffff */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006214 *p++ = '\\';
6215 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006216 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6217 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6218 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6219 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006220 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006221 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6222 else {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006223
Victor Stinner62ec3312016-09-06 17:04:34 -07006224 /* Make sure that the first two digits are zero */
6225 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006226 *p++ = '\\';
Victor Stinner62ec3312016-09-06 17:04:34 -07006227 *p++ = 'U';
6228 *p++ = '0';
6229 *p++ = '0';
6230 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6231 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6232 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6233 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6234 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6235 *p++ = Py_hexdigits[ch & 0x0000000F];
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006236 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006237 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006238
Victor Stinner62ec3312016-09-06 17:04:34 -07006239 assert(p - PyBytes_AS_STRING(repr) > 0);
6240 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6241 return NULL;
6242 }
6243 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006244}
6245
Alexander Belopolsky40018472011-02-26 01:02:56 +00006246PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006247PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6248 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006249{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006250 PyObject *result;
6251 PyObject *tmp = PyUnicode_FromUnicode(s, size);
Victor Stinner62ec3312016-09-06 17:04:34 -07006252 if (tmp == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006253 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006254 }
6255
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006256 result = PyUnicode_AsUnicodeEscapeString(tmp);
6257 Py_DECREF(tmp);
6258 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006259}
6260
6261/* --- Raw Unicode Escape Codec ------------------------------------------- */
6262
Alexander Belopolsky40018472011-02-26 01:02:56 +00006263PyObject *
6264PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006265 Py_ssize_t size,
6266 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006267{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006268 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006269 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006270 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006271 PyObject *errorHandler = NULL;
6272 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006273
Victor Stinner62ec3312016-09-06 17:04:34 -07006274 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006275 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006276 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006277
Guido van Rossumd57fd912000-03-10 22:53:23 +00006278 /* Escaped strings will always be longer than the resulting
6279 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006280 length after conversion to the true value. (But decoding error
6281 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006282 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07006283 writer.min_length = size;
6284 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6285 goto onError;
6286 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006287
Guido van Rossumd57fd912000-03-10 22:53:23 +00006288 end = s + size;
6289 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006290 unsigned char c = (unsigned char) *s++;
6291 Py_UCS4 ch;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006292 int count;
Victor Stinner62ec3312016-09-06 17:04:34 -07006293 Py_ssize_t startinpos;
6294 Py_ssize_t endinpos;
6295 const char *message;
6296
6297#define WRITE_CHAR(ch) \
6298 do { \
6299 if (ch <= writer.maxchar) { \
6300 assert(writer.pos < writer.size); \
6301 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6302 } \
6303 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6304 goto onError; \
6305 } \
6306 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006307
Benjamin Peterson29060642009-01-31 22:14:21 +00006308 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006309 if (c != '\\' || s >= end) {
6310 WRITE_CHAR(c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006311 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006312 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006313
Victor Stinner62ec3312016-09-06 17:04:34 -07006314 c = (unsigned char) *s++;
6315 if (c == 'u') {
6316 count = 4;
6317 message = "truncated \\uXXXX escape";
Benjamin Peterson29060642009-01-31 22:14:21 +00006318 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006319 else if (c == 'U') {
6320 count = 8;
6321 message = "truncated \\UXXXXXXXX escape";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006322 }
6323 else {
Victor Stinner62ec3312016-09-06 17:04:34 -07006324 assert(writer.pos < writer.size);
6325 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6326 WRITE_CHAR(c);
6327 continue;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006328 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006329 startinpos = s - starts - 2;
6330
6331 /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6332 for (ch = 0; count && s < end; ++s, --count) {
6333 c = (unsigned char)*s;
6334 ch <<= 4;
6335 if (c >= '0' && c <= '9') {
6336 ch += c - '0';
6337 }
6338 else if (c >= 'a' && c <= 'f') {
6339 ch += c - ('a' - 10);
6340 }
6341 else if (c >= 'A' && c <= 'F') {
6342 ch += c - ('A' - 10);
6343 }
6344 else {
6345 break;
6346 }
6347 }
6348 if (!count) {
6349 if (ch <= MAX_UNICODE) {
6350 WRITE_CHAR(ch);
6351 continue;
6352 }
6353 message = "\\Uxxxxxxxx out of range";
6354 }
6355
6356 endinpos = s-starts;
6357 writer.min_length = end - s + writer.pos;
6358 if (unicode_decode_call_errorhandler_writer(
6359 errors, &errorHandler,
6360 "rawunicodeescape", message,
6361 &starts, &end, &startinpos, &endinpos, &exc, &s,
6362 &writer)) {
6363 goto onError;
6364 }
6365 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0) {
6366 goto onError;
6367 }
6368
6369#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006370 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006371 Py_XDECREF(errorHandler);
6372 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006373 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006374
Benjamin Peterson29060642009-01-31 22:14:21 +00006375 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006376 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006377 Py_XDECREF(errorHandler);
6378 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006379 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006380
Guido van Rossumd57fd912000-03-10 22:53:23 +00006381}
6382
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006383
Alexander Belopolsky40018472011-02-26 01:02:56 +00006384PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006385PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006386{
Victor Stinner62ec3312016-09-06 17:04:34 -07006387 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006388 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006389 Py_ssize_t expandsize, pos;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006390 int kind;
6391 void *data;
6392 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006393
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006394 if (!PyUnicode_Check(unicode)) {
6395 PyErr_BadArgument();
6396 return NULL;
6397 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006398 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006399 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006400 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006401 kind = PyUnicode_KIND(unicode);
6402 data = PyUnicode_DATA(unicode);
6403 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006404 if (kind == PyUnicode_1BYTE_KIND) {
6405 return PyBytes_FromStringAndSize(data, len);
6406 }
Victor Stinner0e368262011-11-10 20:12:49 +01006407
Victor Stinner62ec3312016-09-06 17:04:34 -07006408 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6409 bytes, and 1 byte characters 4. */
6410 expandsize = kind * 2 + 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006411
Victor Stinner62ec3312016-09-06 17:04:34 -07006412 if (len > PY_SSIZE_T_MAX / expandsize) {
6413 return PyErr_NoMemory();
6414 }
6415 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6416 if (repr == NULL) {
6417 return NULL;
6418 }
6419 if (len == 0) {
6420 return repr;
6421 }
6422
6423 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006424 for (pos = 0; pos < len; pos++) {
6425 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Victor Stinner358af132015-10-12 22:36:57 +02006426
Victor Stinner62ec3312016-09-06 17:04:34 -07006427 /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6428 if (ch < 0x100) {
6429 *p++ = (char) ch;
Tim Petersced69f82003-09-16 20:30:58 +00006430 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006431 /* U+0000-U+00ff range: Map 16-bit characters to '\uHHHH' */
6432 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006433 *p++ = '\\';
6434 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006435 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6436 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6437 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6438 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006439 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006440 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6441 else {
6442 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6443 *p++ = '\\';
6444 *p++ = 'U';
6445 *p++ = '0';
6446 *p++ = '0';
6447 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6448 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6449 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6450 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6451 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6452 *p++ = Py_hexdigits[ch & 15];
6453 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006454 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006455
Victor Stinner62ec3312016-09-06 17:04:34 -07006456 assert(p > PyBytes_AS_STRING(repr));
6457 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6458 return NULL;
6459 }
6460 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006461}
6462
Alexander Belopolsky40018472011-02-26 01:02:56 +00006463PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006464PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6465 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006466{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006467 PyObject *result;
6468 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6469 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006470 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006471 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6472 Py_DECREF(tmp);
6473 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006474}
6475
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006476/* --- Unicode Internal Codec ------------------------------------------- */
6477
Alexander Belopolsky40018472011-02-26 01:02:56 +00006478PyObject *
6479_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006480 Py_ssize_t size,
6481 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006482{
6483 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006484 Py_ssize_t startinpos;
6485 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006486 _PyUnicodeWriter writer;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006487 const char *end;
6488 const char *reason;
6489 PyObject *errorHandler = NULL;
6490 PyObject *exc = NULL;
6491
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006492 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006493 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006494 1))
6495 return NULL;
6496
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006497 if (size == 0)
6498 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006499
Victor Stinner8f674cc2013-04-17 23:02:17 +02006500 _PyUnicodeWriter_Init(&writer);
6501 if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) {
6502 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00006503 goto onError;
Victor Stinner8f674cc2013-04-17 23:02:17 +02006504 }
6505 writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006506
Victor Stinner8f674cc2013-04-17 23:02:17 +02006507 end = s + size;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006508 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006509 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006510 Py_UCS4 ch;
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006511 if (end - s < Py_UNICODE_SIZE) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006512 endinpos = end-starts;
6513 reason = "truncated input";
6514 goto error;
6515 }
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006516 /* We copy the raw representation one byte at a time because the
6517 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006518 ((char *) &uch)[0] = s[0];
6519 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006520#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006521 ((char *) &uch)[2] = s[2];
6522 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006523#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006524 ch = uch;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006525#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006526 /* We have to sanity check the raw data, otherwise doom looms for
6527 some malformed UCS-4 data. */
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006528 if (ch > 0x10ffff) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006529 endinpos = s - starts + Py_UNICODE_SIZE;
6530 reason = "illegal code point (> 0x10FFFF)";
6531 goto error;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006532 }
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006533#endif
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006534 s += Py_UNICODE_SIZE;
6535#ifndef Py_UNICODE_WIDE
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006536 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006537 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006538 Py_UNICODE uch2;
6539 ((char *) &uch2)[0] = s[0];
6540 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006541 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006542 {
Victor Stinner551ac952011-11-29 22:58:13 +01006543 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006544 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006545 }
6546 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006547#endif
6548
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006549 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006550 goto onError;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006551 continue;
6552
6553 error:
6554 startinpos = s - starts;
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006555 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006556 errors, &errorHandler,
6557 "unicode_internal", reason,
6558 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006559 &writer))
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006560 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006561 }
6562
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006563 Py_XDECREF(errorHandler);
6564 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006565 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006566
Benjamin Peterson29060642009-01-31 22:14:21 +00006567 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006568 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006569 Py_XDECREF(errorHandler);
6570 Py_XDECREF(exc);
6571 return NULL;
6572}
6573
Guido van Rossumd57fd912000-03-10 22:53:23 +00006574/* --- Latin-1 Codec ------------------------------------------------------ */
6575
Alexander Belopolsky40018472011-02-26 01:02:56 +00006576PyObject *
6577PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006578 Py_ssize_t size,
6579 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006580{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006581 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006582 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006583}
6584
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006585/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006586static void
6587make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006588 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006589 PyObject *unicode,
6590 Py_ssize_t startpos, Py_ssize_t endpos,
6591 const char *reason)
6592{
6593 if (*exceptionObject == NULL) {
6594 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006595 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006596 encoding, unicode, startpos, endpos, reason);
6597 }
6598 else {
6599 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6600 goto onError;
6601 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6602 goto onError;
6603 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6604 goto onError;
6605 return;
6606 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006607 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006608 }
6609}
6610
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006611/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006612static void
6613raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006614 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006615 PyObject *unicode,
6616 Py_ssize_t startpos, Py_ssize_t endpos,
6617 const char *reason)
6618{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006619 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006620 encoding, unicode, startpos, endpos, reason);
6621 if (*exceptionObject != NULL)
6622 PyCodec_StrictErrors(*exceptionObject);
6623}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006624
6625/* error handling callback helper:
6626 build arguments, call the callback and check the arguments,
6627 put the result into newpos and return the replacement string, which
6628 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006629static PyObject *
6630unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006631 PyObject **errorHandler,
6632 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006633 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006634 Py_ssize_t startpos, Py_ssize_t endpos,
6635 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006636{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02006637 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006638 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006639 PyObject *restuple;
6640 PyObject *resunicode;
6641
6642 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006643 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006644 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006645 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006646 }
6647
Benjamin Petersonbac79492012-01-14 13:34:47 -05006648 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006649 return NULL;
6650 len = PyUnicode_GET_LENGTH(unicode);
6651
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006652 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006653 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006654 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006655 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006656
6657 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006658 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006659 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006660 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006661 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006662 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006663 Py_DECREF(restuple);
6664 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006665 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006666 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006667 &resunicode, newpos)) {
6668 Py_DECREF(restuple);
6669 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006670 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006671 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6672 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6673 Py_DECREF(restuple);
6674 return NULL;
6675 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006676 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006677 *newpos = len + *newpos;
6678 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02006679 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006680 Py_DECREF(restuple);
6681 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006682 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006683 Py_INCREF(resunicode);
6684 Py_DECREF(restuple);
6685 return resunicode;
6686}
6687
Alexander Belopolsky40018472011-02-26 01:02:56 +00006688static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006689unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006690 const char *errors,
Victor Stinner0030cd52015-09-24 14:45:00 +02006691 const Py_UCS4 limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006692{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006693 /* input state */
6694 Py_ssize_t pos=0, size;
6695 int kind;
6696 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006697 /* pointer into the output */
6698 char *str;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006699 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6700 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Victor Stinner50149202015-09-22 00:26:54 +02006701 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006702 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02006703 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006704 PyObject *rep = NULL;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006705 /* output object */
6706 _PyBytesWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006707
Benjamin Petersonbac79492012-01-14 13:34:47 -05006708 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006709 return NULL;
6710 size = PyUnicode_GET_LENGTH(unicode);
6711 kind = PyUnicode_KIND(unicode);
6712 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006713 /* allocate enough for a simple encoding without
6714 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006715 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006716 return PyBytes_FromStringAndSize(NULL, 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006717
6718 _PyBytesWriter_Init(&writer);
6719 str = _PyBytesWriter_Alloc(&writer, size);
6720 if (str == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006721 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006722
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006723 while (pos < size) {
Victor Stinner0030cd52015-09-24 14:45:00 +02006724 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006725
Benjamin Peterson29060642009-01-31 22:14:21 +00006726 /* can we encode this? */
Victor Stinner0030cd52015-09-24 14:45:00 +02006727 if (ch < limit) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006728 /* no overflow check, because we know that the space is enough */
Victor Stinner0030cd52015-09-24 14:45:00 +02006729 *str++ = (char)ch;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006730 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006731 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006732 else {
Victor Stinner6bd525b2015-10-09 13:10:05 +02006733 Py_ssize_t newpos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006734 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006735 Py_ssize_t collstart = pos;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006736 Py_ssize_t collend = collstart + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006737 /* find all unecodable characters */
Victor Stinner50149202015-09-22 00:26:54 +02006738
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006739 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006740 ++collend;
Victor Stinner50149202015-09-22 00:26:54 +02006741
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006742 /* Only overallocate the buffer if it's not the last write */
6743 writer.overallocate = (collend < size);
6744
Benjamin Peterson29060642009-01-31 22:14:21 +00006745 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02006746 if (error_handler == _Py_ERROR_UNKNOWN)
6747 error_handler = get_error_handler(errors);
6748
6749 switch (error_handler) {
6750 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006751 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006752 goto onError;
Victor Stinner50149202015-09-22 00:26:54 +02006753
6754 case _Py_ERROR_REPLACE:
Victor Stinner01ada392015-10-01 21:54:51 +02006755 memset(str, '?', collend - collstart);
6756 str += (collend - collstart);
Victor Stinner0030cd52015-09-24 14:45:00 +02006757 /* fall through ignore error handler */
Victor Stinner50149202015-09-22 00:26:54 +02006758 case _Py_ERROR_IGNORE:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006759 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006760 break;
Victor Stinner50149202015-09-22 00:26:54 +02006761
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006762 case _Py_ERROR_BACKSLASHREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006763 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006764 writer.min_size -= (collend - collstart);
6765 str = backslashreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006766 unicode, collstart, collend);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006767 if (str == NULL)
6768 goto onError;
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006769 pos = collend;
6770 break;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006771
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006772 case _Py_ERROR_XMLCHARREFREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006773 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006774 writer.min_size -= (collend - collstart);
6775 str = xmlcharrefreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006776 unicode, collstart, collend);
6777 if (str == NULL)
6778 goto onError;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006779 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006780 break;
Victor Stinner50149202015-09-22 00:26:54 +02006781
Victor Stinnerc3713e92015-09-29 12:32:13 +02006782 case _Py_ERROR_SURROGATEESCAPE:
6783 for (i = collstart; i < collend; ++i) {
6784 ch = PyUnicode_READ(kind, data, i);
6785 if (ch < 0xdc80 || 0xdcff < ch) {
6786 /* Not a UTF-8b surrogate */
6787 break;
6788 }
6789 *str++ = (char)(ch - 0xdc00);
6790 ++pos;
6791 }
6792 if (i >= collend)
6793 break;
6794 collstart = pos;
6795 assert(collstart != collend);
6796 /* fallback to general error handling */
6797
Benjamin Peterson29060642009-01-31 22:14:21 +00006798 default:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006799 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
6800 encoding, reason, unicode, &exc,
6801 collstart, collend, &newpos);
6802 if (rep == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006803 goto onError;
Victor Stinner0030cd52015-09-24 14:45:00 +02006804
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006805 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006806 writer.min_size -= 1;
6807
Victor Stinner6bd525b2015-10-09 13:10:05 +02006808 if (PyBytes_Check(rep)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006809 /* Directly copy bytes result to output. */
Victor Stinnerce179bf2015-10-09 12:57:22 +02006810 str = _PyBytesWriter_WriteBytes(&writer, str,
Victor Stinner6bd525b2015-10-09 13:10:05 +02006811 PyBytes_AS_STRING(rep),
6812 PyBytes_GET_SIZE(rep));
Victor Stinnerad771582015-10-09 12:38:53 +02006813 if (str == NULL)
6814 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006815 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02006816 else {
6817 assert(PyUnicode_Check(rep));
Victor Stinner0030cd52015-09-24 14:45:00 +02006818
Victor Stinner6bd525b2015-10-09 13:10:05 +02006819 if (PyUnicode_READY(rep) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006820 goto onError;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006821
6822 if (PyUnicode_IS_ASCII(rep)) {
6823 /* Fast path: all characters are smaller than limit */
6824 assert(limit >= 128);
6825 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6826 str = _PyBytesWriter_WriteBytes(&writer, str,
6827 PyUnicode_DATA(rep),
6828 PyUnicode_GET_LENGTH(rep));
Benjamin Peterson29060642009-01-31 22:14:21 +00006829 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02006830 else {
6831 Py_ssize_t repsize = PyUnicode_GET_LENGTH(rep);
6832
6833 str = _PyBytesWriter_Prepare(&writer, str, repsize);
6834 if (str == NULL)
6835 goto onError;
6836
6837 /* check if there is anything unencodable in the
6838 replacement and copy it to the output */
6839 for (i = 0; repsize-->0; ++i, ++str) {
6840 ch = PyUnicode_READ_CHAR(rep, i);
6841 if (ch >= limit) {
6842 raise_encode_exception(&exc, encoding, unicode,
6843 pos, pos+1, reason);
6844 goto onError;
6845 }
6846 *str = (char)ch;
6847 }
6848 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006849 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006850 pos = newpos;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006851 Py_CLEAR(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006852 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006853
6854 /* If overallocation was disabled, ensure that it was the last
6855 write. Otherwise, we missed an optimization */
6856 assert(writer.overallocate || pos == size);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006857 }
6858 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006859
Victor Stinner50149202015-09-22 00:26:54 +02006860 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006861 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006862 return _PyBytesWriter_Finish(&writer, str);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006863
6864 onError:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006865 Py_XDECREF(rep);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006866 _PyBytesWriter_Dealloc(&writer);
Victor Stinner50149202015-09-22 00:26:54 +02006867 Py_XDECREF(error_handler_obj);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006868 Py_XDECREF(exc);
6869 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006870}
6871
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006872/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006873PyObject *
6874PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006875 Py_ssize_t size,
6876 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006877{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006878 PyObject *result;
6879 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6880 if (unicode == NULL)
6881 return NULL;
6882 result = unicode_encode_ucs1(unicode, errors, 256);
6883 Py_DECREF(unicode);
6884 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006885}
6886
Alexander Belopolsky40018472011-02-26 01:02:56 +00006887PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006888_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006889{
6890 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006891 PyErr_BadArgument();
6892 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006893 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006894 if (PyUnicode_READY(unicode) == -1)
6895 return NULL;
6896 /* Fast path: if it is a one-byte string, construct
6897 bytes object directly. */
6898 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6899 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6900 PyUnicode_GET_LENGTH(unicode));
6901 /* Non-Latin-1 characters present. Defer to above function to
6902 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006903 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006904}
6905
6906PyObject*
6907PyUnicode_AsLatin1String(PyObject *unicode)
6908{
6909 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006910}
6911
6912/* --- 7-bit ASCII Codec -------------------------------------------------- */
6913
Alexander Belopolsky40018472011-02-26 01:02:56 +00006914PyObject *
6915PyUnicode_DecodeASCII(const char *s,
6916 Py_ssize_t size,
6917 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006918{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006919 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006920 _PyUnicodeWriter writer;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006921 int kind;
6922 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006923 Py_ssize_t startinpos;
6924 Py_ssize_t endinpos;
6925 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006926 const char *e;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006927 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006928 PyObject *exc = NULL;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006929 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Tim Petersced69f82003-09-16 20:30:58 +00006930
Guido van Rossumd57fd912000-03-10 22:53:23 +00006931 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02006932 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006933
Guido van Rossumd57fd912000-03-10 22:53:23 +00006934 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006935 if (size == 1 && (unsigned char)s[0] < 128)
6936 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006937
Victor Stinner8f674cc2013-04-17 23:02:17 +02006938 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02006939 writer.min_length = size;
6940 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
Victor Stinner8f674cc2013-04-17 23:02:17 +02006941 return NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006942
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006943 e = s + size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006944 data = writer.data;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006945 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006946 writer.pos = outpos;
6947 if (writer.pos == size)
6948 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006949
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006950 s += writer.pos;
6951 kind = writer.kind;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006952 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02006953 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00006954 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006955 PyUnicode_WRITE(kind, data, writer.pos, c);
6956 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00006957 ++s;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006958 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +00006959 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02006960
6961 /* byte outsize range 0x00..0x7f: call the error handler */
6962
6963 if (error_handler == _Py_ERROR_UNKNOWN)
6964 error_handler = get_error_handler(errors);
6965
6966 switch (error_handler)
6967 {
6968 case _Py_ERROR_REPLACE:
6969 case _Py_ERROR_SURROGATEESCAPE:
6970 /* Fast-path: the error handler only writes one character,
Victor Stinnerca9381e2015-09-22 00:58:32 +02006971 but we may switch to UCS2 at the first write */
6972 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
6973 goto onError;
6974 kind = writer.kind;
6975 data = writer.data;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006976
6977 if (error_handler == _Py_ERROR_REPLACE)
6978 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
6979 else
6980 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
6981 writer.pos++;
6982 ++s;
6983 break;
6984
6985 case _Py_ERROR_IGNORE:
6986 ++s;
6987 break;
6988
6989 default:
Benjamin Peterson29060642009-01-31 22:14:21 +00006990 startinpos = s-starts;
6991 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006992 if (unicode_decode_call_errorhandler_writer(
Victor Stinnerf96418d2015-09-21 23:06:27 +02006993 errors, &error_handler_obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00006994 "ascii", "ordinal not in range(128)",
6995 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006996 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006997 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006998 kind = writer.kind;
6999 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00007000 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007001 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007002 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007003 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007004 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007005
Benjamin Peterson29060642009-01-31 22:14:21 +00007006 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007007 _PyUnicodeWriter_Dealloc(&writer);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007008 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007009 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007010 return NULL;
7011}
7012
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007013/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007014PyObject *
7015PyUnicode_EncodeASCII(const Py_UNICODE *p,
7016 Py_ssize_t size,
7017 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007018{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007019 PyObject *result;
7020 PyObject *unicode = PyUnicode_FromUnicode(p, size);
7021 if (unicode == NULL)
7022 return NULL;
7023 result = unicode_encode_ucs1(unicode, errors, 128);
7024 Py_DECREF(unicode);
7025 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007026}
7027
Alexander Belopolsky40018472011-02-26 01:02:56 +00007028PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007029_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007030{
7031 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007032 PyErr_BadArgument();
7033 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007034 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007035 if (PyUnicode_READY(unicode) == -1)
7036 return NULL;
7037 /* Fast path: if it is an ASCII-only string, construct bytes object
7038 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02007039 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007040 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7041 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007042 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007043}
7044
7045PyObject *
7046PyUnicode_AsASCIIString(PyObject *unicode)
7047{
7048 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007049}
7050
Steve Dowercc16be82016-09-08 10:35:16 -07007051#ifdef MS_WINDOWS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007052
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007053/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007054
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00007055#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007056#define NEED_RETRY
7057#endif
7058
Victor Stinner3a50e702011-10-18 21:21:00 +02007059#ifndef WC_ERR_INVALID_CHARS
7060# define WC_ERR_INVALID_CHARS 0x0080
7061#endif
7062
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007063static const char*
Victor Stinner3a50e702011-10-18 21:21:00 +02007064code_page_name(UINT code_page, PyObject **obj)
7065{
7066 *obj = NULL;
7067 if (code_page == CP_ACP)
7068 return "mbcs";
7069 if (code_page == CP_UTF7)
7070 return "CP_UTF7";
7071 if (code_page == CP_UTF8)
7072 return "CP_UTF8";
7073
7074 *obj = PyBytes_FromFormat("cp%u", code_page);
7075 if (*obj == NULL)
7076 return NULL;
7077 return PyBytes_AS_STRING(*obj);
7078}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007079
Victor Stinner3a50e702011-10-18 21:21:00 +02007080static DWORD
7081decode_code_page_flags(UINT code_page)
7082{
7083 if (code_page == CP_UTF7) {
7084 /* The CP_UTF7 decoder only supports flags=0 */
7085 return 0;
7086 }
7087 else
7088 return MB_ERR_INVALID_CHARS;
7089}
7090
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007091/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007092 * Decode a byte string from a Windows code page into unicode object in strict
7093 * mode.
7094 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007095 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7096 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007097 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007098static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007099decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007100 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02007101 const char *in,
7102 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007103{
Victor Stinner3a50e702011-10-18 21:21:00 +02007104 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01007105 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007106 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007107
7108 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007109 assert(insize > 0);
7110 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
7111 if (outsize <= 0)
7112 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007113
7114 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007115 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01007116 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007117 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00007118 if (*v == NULL)
7119 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007120 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007121 }
7122 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007123 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007124 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01007125 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007126 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007127 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007128 }
7129
7130 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007131 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7132 if (outsize <= 0)
7133 goto error;
7134 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007135
Victor Stinner3a50e702011-10-18 21:21:00 +02007136error:
7137 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7138 return -2;
7139 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007140 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007141}
7142
Victor Stinner3a50e702011-10-18 21:21:00 +02007143/*
7144 * Decode a byte string from a code page into unicode object with an error
7145 * handler.
7146 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007147 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02007148 * UnicodeDecodeError exception and returns -1 on error.
7149 */
7150static int
7151decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007152 PyObject **v,
7153 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007154 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02007155{
7156 const char *startin = in;
7157 const char *endin = in + size;
7158 const DWORD flags = decode_code_page_flags(code_page);
7159 /* Ideally, we should get reason from FormatMessage. This is the Windows
7160 2000 English version of the message. */
7161 const char *reason = "No mapping for the Unicode character exists "
7162 "in the target code page.";
7163 /* each step cannot decode more than 1 character, but a character can be
7164 represented as a surrogate pair */
7165 wchar_t buffer[2], *startout, *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02007166 int insize;
7167 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007168 PyObject *errorHandler = NULL;
7169 PyObject *exc = NULL;
7170 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007171 const char *encoding;
Victor Stinner3a50e702011-10-18 21:21:00 +02007172 DWORD err;
7173 int ret = -1;
7174
7175 assert(size > 0);
7176
7177 encoding = code_page_name(code_page, &encoding_obj);
7178 if (encoding == NULL)
7179 return -1;
7180
Victor Stinner7d00cc12014-03-17 23:08:06 +01007181 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007182 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7183 UnicodeDecodeError. */
7184 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7185 if (exc != NULL) {
7186 PyCodec_StrictErrors(exc);
7187 Py_CLEAR(exc);
7188 }
7189 goto error;
7190 }
7191
7192 if (*v == NULL) {
7193 /* Create unicode object */
7194 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7195 PyErr_NoMemory();
7196 goto error;
7197 }
Victor Stinnerab595942011-12-17 04:59:06 +01007198 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007199 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02007200 if (*v == NULL)
7201 goto error;
7202 startout = PyUnicode_AS_UNICODE(*v);
7203 }
7204 else {
7205 /* Extend unicode object */
7206 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7207 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7208 PyErr_NoMemory();
7209 goto error;
7210 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007211 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007212 goto error;
7213 startout = PyUnicode_AS_UNICODE(*v) + n;
7214 }
7215
7216 /* Decode the byte string character per character */
7217 out = startout;
7218 while (in < endin)
7219 {
7220 /* Decode a character */
7221 insize = 1;
7222 do
7223 {
7224 outsize = MultiByteToWideChar(code_page, flags,
7225 in, insize,
7226 buffer, Py_ARRAY_LENGTH(buffer));
7227 if (outsize > 0)
7228 break;
7229 err = GetLastError();
7230 if (err != ERROR_NO_UNICODE_TRANSLATION
7231 && err != ERROR_INSUFFICIENT_BUFFER)
7232 {
7233 PyErr_SetFromWindowsErr(0);
7234 goto error;
7235 }
7236 insize++;
7237 }
7238 /* 4=maximum length of a UTF-8 sequence */
7239 while (insize <= 4 && (in + insize) <= endin);
7240
7241 if (outsize <= 0) {
7242 Py_ssize_t startinpos, endinpos, outpos;
7243
Victor Stinner7d00cc12014-03-17 23:08:06 +01007244 /* last character in partial decode? */
7245 if (in + insize >= endin && !final)
7246 break;
7247
Victor Stinner3a50e702011-10-18 21:21:00 +02007248 startinpos = in - startin;
7249 endinpos = startinpos + 1;
7250 outpos = out - PyUnicode_AS_UNICODE(*v);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007251 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007252 errors, &errorHandler,
7253 encoding, reason,
7254 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01007255 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007256 {
7257 goto error;
7258 }
Victor Stinner596a6c42011-11-09 00:02:18 +01007259 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007260 }
7261 else {
7262 in += insize;
7263 memcpy(out, buffer, outsize * sizeof(wchar_t));
7264 out += outsize;
7265 }
7266 }
7267
7268 /* write a NUL character at the end */
7269 *out = 0;
7270
7271 /* Extend unicode object */
7272 outsize = out - startout;
7273 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01007274 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007275 goto error;
Victor Stinnere1f17c62014-07-25 14:03:03 +02007276 /* (in - startin) <= size and size is an int */
7277 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02007278
7279error:
7280 Py_XDECREF(encoding_obj);
7281 Py_XDECREF(errorHandler);
7282 Py_XDECREF(exc);
7283 return ret;
7284}
7285
Victor Stinner3a50e702011-10-18 21:21:00 +02007286static PyObject *
7287decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007288 const char *s, Py_ssize_t size,
7289 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007290{
Victor Stinner76a31a62011-11-04 00:05:13 +01007291 PyObject *v = NULL;
7292 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007293
Victor Stinner3a50e702011-10-18 21:21:00 +02007294 if (code_page < 0) {
7295 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7296 return NULL;
7297 }
7298
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007299 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007300 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007301
Victor Stinner76a31a62011-11-04 00:05:13 +01007302 do
7303 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007304#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007305 if (size > INT_MAX) {
7306 chunk_size = INT_MAX;
7307 final = 0;
7308 done = 0;
7309 }
7310 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007311#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007312 {
7313 chunk_size = (int)size;
7314 final = (consumed == NULL);
7315 done = 1;
7316 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007317
Victor Stinner76a31a62011-11-04 00:05:13 +01007318 if (chunk_size == 0 && done) {
7319 if (v != NULL)
7320 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007321 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007322 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007323
Victor Stinner76a31a62011-11-04 00:05:13 +01007324 converted = decode_code_page_strict(code_page, &v,
7325 s, chunk_size);
7326 if (converted == -2)
7327 converted = decode_code_page_errors(code_page, &v,
7328 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007329 errors, final);
7330 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007331
7332 if (converted < 0) {
7333 Py_XDECREF(v);
7334 return NULL;
7335 }
7336
7337 if (consumed)
7338 *consumed += converted;
7339
7340 s += converted;
7341 size -= converted;
7342 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007343
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007344 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007345}
7346
Alexander Belopolsky40018472011-02-26 01:02:56 +00007347PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007348PyUnicode_DecodeCodePageStateful(int code_page,
7349 const char *s,
7350 Py_ssize_t size,
7351 const char *errors,
7352 Py_ssize_t *consumed)
7353{
7354 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7355}
7356
7357PyObject *
7358PyUnicode_DecodeMBCSStateful(const char *s,
7359 Py_ssize_t size,
7360 const char *errors,
7361 Py_ssize_t *consumed)
7362{
7363 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7364}
7365
7366PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007367PyUnicode_DecodeMBCS(const char *s,
7368 Py_ssize_t size,
7369 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007370{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007371 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7372}
7373
Victor Stinner3a50e702011-10-18 21:21:00 +02007374static DWORD
7375encode_code_page_flags(UINT code_page, const char *errors)
7376{
7377 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007378 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007379 }
7380 else if (code_page == CP_UTF7) {
7381 /* CP_UTF7 only supports flags=0 */
7382 return 0;
7383 }
7384 else {
7385 if (errors != NULL && strcmp(errors, "replace") == 0)
7386 return 0;
7387 else
7388 return WC_NO_BEST_FIT_CHARS;
7389 }
7390}
7391
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007392/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007393 * Encode a Unicode string to a Windows code page into a byte string in strict
7394 * mode.
7395 *
7396 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007397 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007398 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007399static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007400encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007401 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007402 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007403{
Victor Stinner554f3f02010-06-16 23:33:54 +00007404 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007405 BOOL *pusedDefaultChar = &usedDefaultChar;
7406 int outsize;
Victor Stinner24729f32011-11-10 20:31:37 +01007407 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007408 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007409 const DWORD flags = encode_code_page_flags(code_page, NULL);
7410 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007411 /* Create a substring so that we can get the UTF-16 representation
7412 of just the slice under consideration. */
7413 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007414
Martin v. Löwis3d325192011-11-04 18:23:06 +01007415 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007416
Victor Stinner3a50e702011-10-18 21:21:00 +02007417 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007418 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007419 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007420 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007421
Victor Stinner2fc507f2011-11-04 20:06:39 +01007422 substring = PyUnicode_Substring(unicode, offset, offset+len);
7423 if (substring == NULL)
7424 return -1;
7425 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7426 if (p == NULL) {
7427 Py_DECREF(substring);
7428 return -1;
7429 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007430 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007431
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007432 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007433 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007434 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007435 NULL, 0,
7436 NULL, pusedDefaultChar);
7437 if (outsize <= 0)
7438 goto error;
7439 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007440 if (pusedDefaultChar && *pusedDefaultChar) {
7441 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007442 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007443 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007444
Victor Stinner3a50e702011-10-18 21:21:00 +02007445 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007446 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007447 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007448 if (*outbytes == NULL) {
7449 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007450 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007451 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007452 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007453 }
7454 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007455 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007456 const Py_ssize_t n = PyBytes_Size(*outbytes);
7457 if (outsize > PY_SSIZE_T_MAX - n) {
7458 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007459 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007460 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007461 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007462 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7463 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007464 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007465 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007466 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007467 }
7468
7469 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007470 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007471 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007472 out, outsize,
7473 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007474 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007475 if (outsize <= 0)
7476 goto error;
7477 if (pusedDefaultChar && *pusedDefaultChar)
7478 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007479 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007480
Victor Stinner3a50e702011-10-18 21:21:00 +02007481error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007482 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007483 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7484 return -2;
7485 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007486 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007487}
7488
Victor Stinner3a50e702011-10-18 21:21:00 +02007489/*
Serhiy Storchakad65c9492015-11-02 14:10:23 +02007490 * Encode a Unicode string to a Windows code page into a byte string using an
Victor Stinner3a50e702011-10-18 21:21:00 +02007491 * error handler.
7492 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007493 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007494 * -1 on other error.
7495 */
7496static int
7497encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007498 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007499 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007500{
Victor Stinner3a50e702011-10-18 21:21:00 +02007501 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007502 Py_ssize_t pos = unicode_offset;
7503 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007504 /* Ideally, we should get reason from FormatMessage. This is the Windows
7505 2000 English version of the message. */
7506 const char *reason = "invalid character";
7507 /* 4=maximum length of a UTF-8 sequence */
7508 char buffer[4];
7509 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7510 Py_ssize_t outsize;
7511 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007512 PyObject *errorHandler = NULL;
7513 PyObject *exc = NULL;
7514 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007515 const char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007516 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007517 PyObject *rep;
7518 int ret = -1;
7519
7520 assert(insize > 0);
7521
7522 encoding = code_page_name(code_page, &encoding_obj);
7523 if (encoding == NULL)
7524 return -1;
7525
7526 if (errors == NULL || strcmp(errors, "strict") == 0) {
7527 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7528 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007529 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007530 if (exc != NULL) {
7531 PyCodec_StrictErrors(exc);
7532 Py_DECREF(exc);
7533 }
7534 Py_XDECREF(encoding_obj);
7535 return -1;
7536 }
7537
7538 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7539 pusedDefaultChar = &usedDefaultChar;
7540 else
7541 pusedDefaultChar = NULL;
7542
7543 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7544 PyErr_NoMemory();
7545 goto error;
7546 }
7547 outsize = insize * Py_ARRAY_LENGTH(buffer);
7548
7549 if (*outbytes == NULL) {
7550 /* Create string object */
7551 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7552 if (*outbytes == NULL)
7553 goto error;
7554 out = PyBytes_AS_STRING(*outbytes);
7555 }
7556 else {
7557 /* Extend string object */
7558 Py_ssize_t n = PyBytes_Size(*outbytes);
7559 if (n > PY_SSIZE_T_MAX - outsize) {
7560 PyErr_NoMemory();
7561 goto error;
7562 }
7563 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7564 goto error;
7565 out = PyBytes_AS_STRING(*outbytes) + n;
7566 }
7567
7568 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007569 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007570 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007571 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7572 wchar_t chars[2];
7573 int charsize;
7574 if (ch < 0x10000) {
7575 chars[0] = (wchar_t)ch;
7576 charsize = 1;
7577 }
7578 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007579 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7580 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007581 charsize = 2;
7582 }
7583
Victor Stinner3a50e702011-10-18 21:21:00 +02007584 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007585 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007586 buffer, Py_ARRAY_LENGTH(buffer),
7587 NULL, pusedDefaultChar);
7588 if (outsize > 0) {
7589 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7590 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007591 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007592 memcpy(out, buffer, outsize);
7593 out += outsize;
7594 continue;
7595 }
7596 }
7597 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7598 PyErr_SetFromWindowsErr(0);
7599 goto error;
7600 }
7601
Victor Stinner3a50e702011-10-18 21:21:00 +02007602 rep = unicode_encode_call_errorhandler(
7603 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007604 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007605 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007606 if (rep == NULL)
7607 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007608 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007609
7610 if (PyBytes_Check(rep)) {
7611 outsize = PyBytes_GET_SIZE(rep);
7612 if (outsize != 1) {
7613 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7614 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7615 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7616 Py_DECREF(rep);
7617 goto error;
7618 }
7619 out = PyBytes_AS_STRING(*outbytes) + offset;
7620 }
7621 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7622 out += outsize;
7623 }
7624 else {
7625 Py_ssize_t i;
7626 enum PyUnicode_Kind kind;
7627 void *data;
7628
Benjamin Petersonbac79492012-01-14 13:34:47 -05007629 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007630 Py_DECREF(rep);
7631 goto error;
7632 }
7633
7634 outsize = PyUnicode_GET_LENGTH(rep);
7635 if (outsize != 1) {
7636 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7637 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7638 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7639 Py_DECREF(rep);
7640 goto error;
7641 }
7642 out = PyBytes_AS_STRING(*outbytes) + offset;
7643 }
7644 kind = PyUnicode_KIND(rep);
7645 data = PyUnicode_DATA(rep);
7646 for (i=0; i < outsize; i++) {
7647 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7648 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007649 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007650 encoding, unicode,
7651 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007652 "unable to encode error handler result to ASCII");
7653 Py_DECREF(rep);
7654 goto error;
7655 }
7656 *out = (unsigned char)ch;
7657 out++;
7658 }
7659 }
7660 Py_DECREF(rep);
7661 }
7662 /* write a NUL byte */
7663 *out = 0;
7664 outsize = out - PyBytes_AS_STRING(*outbytes);
7665 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7666 if (_PyBytes_Resize(outbytes, outsize) < 0)
7667 goto error;
7668 ret = 0;
7669
7670error:
7671 Py_XDECREF(encoding_obj);
7672 Py_XDECREF(errorHandler);
7673 Py_XDECREF(exc);
7674 return ret;
7675}
7676
Victor Stinner3a50e702011-10-18 21:21:00 +02007677static PyObject *
7678encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007679 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007680 const char *errors)
7681{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007682 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007683 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007684 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007685 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007686
Victor Stinner29dacf22015-01-26 16:41:32 +01007687 if (!PyUnicode_Check(unicode)) {
7688 PyErr_BadArgument();
7689 return NULL;
7690 }
7691
Benjamin Petersonbac79492012-01-14 13:34:47 -05007692 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007693 return NULL;
7694 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007695
Victor Stinner3a50e702011-10-18 21:21:00 +02007696 if (code_page < 0) {
7697 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7698 return NULL;
7699 }
7700
Martin v. Löwis3d325192011-11-04 18:23:06 +01007701 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007702 return PyBytes_FromStringAndSize(NULL, 0);
7703
Victor Stinner7581cef2011-11-03 22:32:33 +01007704 offset = 0;
7705 do
7706 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007707#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007708 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007709 chunks. */
7710 if (len > INT_MAX/2) {
7711 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007712 done = 0;
7713 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007714 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007715#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007716 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007717 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007718 done = 1;
7719 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007720
Victor Stinner76a31a62011-11-04 00:05:13 +01007721 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007722 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007723 errors);
7724 if (ret == -2)
7725 ret = encode_code_page_errors(code_page, &outbytes,
7726 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007727 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007728 if (ret < 0) {
7729 Py_XDECREF(outbytes);
7730 return NULL;
7731 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007732
Victor Stinner7581cef2011-11-03 22:32:33 +01007733 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007734 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007735 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007736
Victor Stinner3a50e702011-10-18 21:21:00 +02007737 return outbytes;
7738}
7739
7740PyObject *
7741PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7742 Py_ssize_t size,
7743 const char *errors)
7744{
Victor Stinner7581cef2011-11-03 22:32:33 +01007745 PyObject *unicode, *res;
7746 unicode = PyUnicode_FromUnicode(p, size);
7747 if (unicode == NULL)
7748 return NULL;
7749 res = encode_code_page(CP_ACP, unicode, errors);
7750 Py_DECREF(unicode);
7751 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007752}
7753
7754PyObject *
7755PyUnicode_EncodeCodePage(int code_page,
7756 PyObject *unicode,
7757 const char *errors)
7758{
Victor Stinner7581cef2011-11-03 22:32:33 +01007759 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007760}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007761
Alexander Belopolsky40018472011-02-26 01:02:56 +00007762PyObject *
7763PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007764{
Victor Stinner7581cef2011-11-03 22:32:33 +01007765 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007766}
7767
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007768#undef NEED_RETRY
7769
Steve Dowercc16be82016-09-08 10:35:16 -07007770#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007771
Guido van Rossumd57fd912000-03-10 22:53:23 +00007772/* --- Character Mapping Codec -------------------------------------------- */
7773
Victor Stinnerfb161b12013-04-18 01:44:27 +02007774static int
7775charmap_decode_string(const char *s,
7776 Py_ssize_t size,
7777 PyObject *mapping,
7778 const char *errors,
7779 _PyUnicodeWriter *writer)
7780{
7781 const char *starts = s;
7782 const char *e;
7783 Py_ssize_t startinpos, endinpos;
7784 PyObject *errorHandler = NULL, *exc = NULL;
7785 Py_ssize_t maplen;
7786 enum PyUnicode_Kind mapkind;
7787 void *mapdata;
7788 Py_UCS4 x;
7789 unsigned char ch;
7790
7791 if (PyUnicode_READY(mapping) == -1)
7792 return -1;
7793
7794 maplen = PyUnicode_GET_LENGTH(mapping);
7795 mapdata = PyUnicode_DATA(mapping);
7796 mapkind = PyUnicode_KIND(mapping);
7797
7798 e = s + size;
7799
7800 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7801 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7802 * is disabled in encoding aliases, latin1 is preferred because
7803 * its implementation is faster. */
7804 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7805 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7806 Py_UCS4 maxchar = writer->maxchar;
7807
7808 assert (writer->kind == PyUnicode_1BYTE_KIND);
7809 while (s < e) {
7810 ch = *s;
7811 x = mapdata_ucs1[ch];
7812 if (x > maxchar) {
7813 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7814 goto onError;
7815 maxchar = writer->maxchar;
7816 outdata = (Py_UCS1 *)writer->data;
7817 }
7818 outdata[writer->pos] = x;
7819 writer->pos++;
7820 ++s;
7821 }
7822 return 0;
7823 }
7824
7825 while (s < e) {
7826 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7827 enum PyUnicode_Kind outkind = writer->kind;
7828 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7829 if (outkind == PyUnicode_1BYTE_KIND) {
7830 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7831 Py_UCS4 maxchar = writer->maxchar;
7832 while (s < e) {
7833 ch = *s;
7834 x = mapdata_ucs2[ch];
7835 if (x > maxchar)
7836 goto Error;
7837 outdata[writer->pos] = x;
7838 writer->pos++;
7839 ++s;
7840 }
7841 break;
7842 }
7843 else if (outkind == PyUnicode_2BYTE_KIND) {
7844 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7845 while (s < e) {
7846 ch = *s;
7847 x = mapdata_ucs2[ch];
7848 if (x == 0xFFFE)
7849 goto Error;
7850 outdata[writer->pos] = x;
7851 writer->pos++;
7852 ++s;
7853 }
7854 break;
7855 }
7856 }
7857 ch = *s;
7858
7859 if (ch < maplen)
7860 x = PyUnicode_READ(mapkind, mapdata, ch);
7861 else
7862 x = 0xfffe; /* invalid value */
7863Error:
7864 if (x == 0xfffe)
7865 {
7866 /* undefined mapping */
7867 startinpos = s-starts;
7868 endinpos = startinpos+1;
7869 if (unicode_decode_call_errorhandler_writer(
7870 errors, &errorHandler,
7871 "charmap", "character maps to <undefined>",
7872 &starts, &e, &startinpos, &endinpos, &exc, &s,
7873 writer)) {
7874 goto onError;
7875 }
7876 continue;
7877 }
7878
7879 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
7880 goto onError;
7881 ++s;
7882 }
7883 Py_XDECREF(errorHandler);
7884 Py_XDECREF(exc);
7885 return 0;
7886
7887onError:
7888 Py_XDECREF(errorHandler);
7889 Py_XDECREF(exc);
7890 return -1;
7891}
7892
7893static int
7894charmap_decode_mapping(const char *s,
7895 Py_ssize_t size,
7896 PyObject *mapping,
7897 const char *errors,
7898 _PyUnicodeWriter *writer)
7899{
7900 const char *starts = s;
7901 const char *e;
7902 Py_ssize_t startinpos, endinpos;
7903 PyObject *errorHandler = NULL, *exc = NULL;
7904 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02007905 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02007906
7907 e = s + size;
7908
7909 while (s < e) {
7910 ch = *s;
7911
7912 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7913 key = PyLong_FromLong((long)ch);
7914 if (key == NULL)
7915 goto onError;
7916
7917 item = PyObject_GetItem(mapping, key);
7918 Py_DECREF(key);
7919 if (item == NULL) {
7920 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7921 /* No mapping found means: mapping is undefined. */
7922 PyErr_Clear();
7923 goto Undefined;
7924 } else
7925 goto onError;
7926 }
7927
7928 /* Apply mapping */
7929 if (item == Py_None)
7930 goto Undefined;
7931 if (PyLong_Check(item)) {
7932 long value = PyLong_AS_LONG(item);
7933 if (value == 0xFFFE)
7934 goto Undefined;
7935 if (value < 0 || value > MAX_UNICODE) {
7936 PyErr_Format(PyExc_TypeError,
7937 "character mapping must be in range(0x%lx)",
7938 (unsigned long)MAX_UNICODE + 1);
7939 goto onError;
7940 }
7941
7942 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7943 goto onError;
7944 }
7945 else if (PyUnicode_Check(item)) {
7946 if (PyUnicode_READY(item) == -1)
7947 goto onError;
7948 if (PyUnicode_GET_LENGTH(item) == 1) {
7949 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
7950 if (value == 0xFFFE)
7951 goto Undefined;
7952 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7953 goto onError;
7954 }
7955 else {
7956 writer->overallocate = 1;
7957 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
7958 goto onError;
7959 }
7960 }
7961 else {
7962 /* wrong return value */
7963 PyErr_SetString(PyExc_TypeError,
7964 "character mapping must return integer, None or str");
7965 goto onError;
7966 }
7967 Py_CLEAR(item);
7968 ++s;
7969 continue;
7970
7971Undefined:
7972 /* undefined mapping */
7973 Py_CLEAR(item);
7974 startinpos = s-starts;
7975 endinpos = startinpos+1;
7976 if (unicode_decode_call_errorhandler_writer(
7977 errors, &errorHandler,
7978 "charmap", "character maps to <undefined>",
7979 &starts, &e, &startinpos, &endinpos, &exc, &s,
7980 writer)) {
7981 goto onError;
7982 }
7983 }
7984 Py_XDECREF(errorHandler);
7985 Py_XDECREF(exc);
7986 return 0;
7987
7988onError:
7989 Py_XDECREF(item);
7990 Py_XDECREF(errorHandler);
7991 Py_XDECREF(exc);
7992 return -1;
7993}
7994
Alexander Belopolsky40018472011-02-26 01:02:56 +00007995PyObject *
7996PyUnicode_DecodeCharmap(const char *s,
7997 Py_ssize_t size,
7998 PyObject *mapping,
7999 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008000{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008001 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00008002
Guido van Rossumd57fd912000-03-10 22:53:23 +00008003 /* Default to Latin-1 */
8004 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008005 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008006
Guido van Rossumd57fd912000-03-10 22:53:23 +00008007 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02008008 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02008009 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02008010 writer.min_length = size;
8011 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008012 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008013
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008014 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008015 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8016 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008017 }
8018 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008019 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8020 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008021 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008022 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00008023
Benjamin Peterson29060642009-01-31 22:14:21 +00008024 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008025 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008026 return NULL;
8027}
8028
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008029/* Charmap encoding: the lookup table */
8030
Alexander Belopolsky40018472011-02-26 01:02:56 +00008031struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00008032 PyObject_HEAD
8033 unsigned char level1[32];
8034 int count2, count3;
8035 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008036};
8037
8038static PyObject*
8039encoding_map_size(PyObject *obj, PyObject* args)
8040{
8041 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008042 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00008043 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008044}
8045
8046static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008047 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00008048 PyDoc_STR("Return the size (in bytes) of this object") },
8049 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008050};
8051
8052static void
8053encoding_map_dealloc(PyObject* o)
8054{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008055 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008056}
8057
8058static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008059 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008060 "EncodingMap", /*tp_name*/
8061 sizeof(struct encoding_map), /*tp_basicsize*/
8062 0, /*tp_itemsize*/
8063 /* methods */
8064 encoding_map_dealloc, /*tp_dealloc*/
8065 0, /*tp_print*/
8066 0, /*tp_getattr*/
8067 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00008068 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008069 0, /*tp_repr*/
8070 0, /*tp_as_number*/
8071 0, /*tp_as_sequence*/
8072 0, /*tp_as_mapping*/
8073 0, /*tp_hash*/
8074 0, /*tp_call*/
8075 0, /*tp_str*/
8076 0, /*tp_getattro*/
8077 0, /*tp_setattro*/
8078 0, /*tp_as_buffer*/
8079 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8080 0, /*tp_doc*/
8081 0, /*tp_traverse*/
8082 0, /*tp_clear*/
8083 0, /*tp_richcompare*/
8084 0, /*tp_weaklistoffset*/
8085 0, /*tp_iter*/
8086 0, /*tp_iternext*/
8087 encoding_map_methods, /*tp_methods*/
8088 0, /*tp_members*/
8089 0, /*tp_getset*/
8090 0, /*tp_base*/
8091 0, /*tp_dict*/
8092 0, /*tp_descr_get*/
8093 0, /*tp_descr_set*/
8094 0, /*tp_dictoffset*/
8095 0, /*tp_init*/
8096 0, /*tp_alloc*/
8097 0, /*tp_new*/
8098 0, /*tp_free*/
8099 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008100};
8101
8102PyObject*
8103PyUnicode_BuildEncodingMap(PyObject* string)
8104{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008105 PyObject *result;
8106 struct encoding_map *mresult;
8107 int i;
8108 int need_dict = 0;
8109 unsigned char level1[32];
8110 unsigned char level2[512];
8111 unsigned char *mlevel1, *mlevel2, *mlevel3;
8112 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008113 int kind;
8114 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008115 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008116 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008117
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008118 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008119 PyErr_BadArgument();
8120 return NULL;
8121 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008122 kind = PyUnicode_KIND(string);
8123 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008124 length = PyUnicode_GET_LENGTH(string);
8125 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008126 memset(level1, 0xFF, sizeof level1);
8127 memset(level2, 0xFF, sizeof level2);
8128
8129 /* If there isn't a one-to-one mapping of NULL to \0,
8130 or if there are non-BMP characters, we need to use
8131 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008132 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008133 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008134 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008135 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008136 ch = PyUnicode_READ(kind, data, i);
8137 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008138 need_dict = 1;
8139 break;
8140 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008141 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008142 /* unmapped character */
8143 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008144 l1 = ch >> 11;
8145 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008146 if (level1[l1] == 0xFF)
8147 level1[l1] = count2++;
8148 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008149 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008150 }
8151
8152 if (count2 >= 0xFF || count3 >= 0xFF)
8153 need_dict = 1;
8154
8155 if (need_dict) {
8156 PyObject *result = PyDict_New();
8157 PyObject *key, *value;
8158 if (!result)
8159 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008160 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008161 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008162 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008163 if (!key || !value)
8164 goto failed1;
8165 if (PyDict_SetItem(result, key, value) == -1)
8166 goto failed1;
8167 Py_DECREF(key);
8168 Py_DECREF(value);
8169 }
8170 return result;
8171 failed1:
8172 Py_XDECREF(key);
8173 Py_XDECREF(value);
8174 Py_DECREF(result);
8175 return NULL;
8176 }
8177
8178 /* Create a three-level trie */
8179 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8180 16*count2 + 128*count3 - 1);
8181 if (!result)
8182 return PyErr_NoMemory();
8183 PyObject_Init(result, &EncodingMapType);
8184 mresult = (struct encoding_map*)result;
8185 mresult->count2 = count2;
8186 mresult->count3 = count3;
8187 mlevel1 = mresult->level1;
8188 mlevel2 = mresult->level23;
8189 mlevel3 = mresult->level23 + 16*count2;
8190 memcpy(mlevel1, level1, 32);
8191 memset(mlevel2, 0xFF, 16*count2);
8192 memset(mlevel3, 0, 128*count3);
8193 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008194 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008195 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008196 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8197 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008198 /* unmapped character */
8199 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008200 o1 = ch>>11;
8201 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008202 i2 = 16*mlevel1[o1] + o2;
8203 if (mlevel2[i2] == 0xFF)
8204 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008205 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008206 i3 = 128*mlevel2[i2] + o3;
8207 mlevel3[i3] = i;
8208 }
8209 return result;
8210}
8211
8212static int
Victor Stinner22168992011-11-20 17:09:18 +01008213encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008214{
8215 struct encoding_map *map = (struct encoding_map*)mapping;
8216 int l1 = c>>11;
8217 int l2 = (c>>7) & 0xF;
8218 int l3 = c & 0x7F;
8219 int i;
8220
Victor Stinner22168992011-11-20 17:09:18 +01008221 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008222 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008223 if (c == 0)
8224 return 0;
8225 /* level 1*/
8226 i = map->level1[l1];
8227 if (i == 0xFF) {
8228 return -1;
8229 }
8230 /* level 2*/
8231 i = map->level23[16*i+l2];
8232 if (i == 0xFF) {
8233 return -1;
8234 }
8235 /* level 3 */
8236 i = map->level23[16*map->count2 + 128*i + l3];
8237 if (i == 0) {
8238 return -1;
8239 }
8240 return i;
8241}
8242
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008243/* Lookup the character ch in the mapping. If the character
8244 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008245 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008246static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008247charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008248{
Christian Heimes217cfd12007-12-02 14:31:20 +00008249 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008250 PyObject *x;
8251
8252 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008253 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008254 x = PyObject_GetItem(mapping, w);
8255 Py_DECREF(w);
8256 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008257 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8258 /* No mapping found means: mapping is undefined. */
8259 PyErr_Clear();
8260 x = Py_None;
8261 Py_INCREF(x);
8262 return x;
8263 } else
8264 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008265 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008266 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008267 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008268 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008269 long value = PyLong_AS_LONG(x);
8270 if (value < 0 || value > 255) {
8271 PyErr_SetString(PyExc_TypeError,
8272 "character mapping must be in range(256)");
8273 Py_DECREF(x);
8274 return NULL;
8275 }
8276 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008277 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008278 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008279 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008280 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008281 /* wrong return value */
8282 PyErr_Format(PyExc_TypeError,
8283 "character mapping must return integer, bytes or None, not %.400s",
8284 x->ob_type->tp_name);
8285 Py_DECREF(x);
8286 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008287 }
8288}
8289
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008290static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008291charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008292{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008293 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8294 /* exponentially overallocate to minimize reallocations */
8295 if (requiredsize < 2*outsize)
8296 requiredsize = 2*outsize;
8297 if (_PyBytes_Resize(outobj, requiredsize))
8298 return -1;
8299 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008300}
8301
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the character has no mapping; nothing was written */
    enc_EXCEPTION   /* the lookup or the output resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008305/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008306 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008307 space is available. Return a new reference to the object that
8308 was put in the output buffer, or Py_None, if the mapping was undefined
8309 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008310 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008311static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008312charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008313 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008314{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008315 PyObject *rep;
8316 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008317 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008318
Christian Heimes90aa7642007-12-19 02:45:37 +00008319 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008320 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008321 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008322 if (res == -1)
8323 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008324 if (outsize<requiredsize)
8325 if (charmapencode_resize(outobj, outpos, requiredsize))
8326 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008327 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008328 outstart[(*outpos)++] = (char)res;
8329 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008330 }
8331
8332 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008333 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008334 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008335 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008336 Py_DECREF(rep);
8337 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008338 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008339 if (PyLong_Check(rep)) {
8340 Py_ssize_t requiredsize = *outpos+1;
8341 if (outsize<requiredsize)
8342 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8343 Py_DECREF(rep);
8344 return enc_EXCEPTION;
8345 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008346 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008347 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008348 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008349 else {
8350 const char *repchars = PyBytes_AS_STRING(rep);
8351 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8352 Py_ssize_t requiredsize = *outpos+repsize;
8353 if (outsize<requiredsize)
8354 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8355 Py_DECREF(rep);
8356 return enc_EXCEPTION;
8357 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008358 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008359 memcpy(outstart + *outpos, repchars, repsize);
8360 *outpos += repsize;
8361 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008362 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008363 Py_DECREF(rep);
8364 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008365}
8366
8367/* handle an error in PyUnicode_EncodeCharmap
8368 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008369static int
8370charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008371 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008372 PyObject **exceptionObject,
Victor Stinner50149202015-09-22 00:26:54 +02008373 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008374 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008375{
8376 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008377 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008378 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008379 enum PyUnicode_Kind kind;
8380 void *data;
8381 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008382 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008383 Py_ssize_t collstartpos = *inpos;
8384 Py_ssize_t collendpos = *inpos+1;
8385 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008386 char *encoding = "charmap";
8387 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008388 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008389 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008390 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008391
Benjamin Petersonbac79492012-01-14 13:34:47 -05008392 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008393 return -1;
8394 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008395 /* find all unencodable characters */
8396 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008397 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008398 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008399 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008400 val = encoding_map_lookup(ch, mapping);
8401 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008402 break;
8403 ++collendpos;
8404 continue;
8405 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008406
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008407 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8408 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008409 if (rep==NULL)
8410 return -1;
8411 else if (rep!=Py_None) {
8412 Py_DECREF(rep);
8413 break;
8414 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008415 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008416 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008417 }
8418 /* cache callback name lookup
8419 * (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02008420 if (*error_handler == _Py_ERROR_UNKNOWN)
8421 *error_handler = get_error_handler(errors);
8422
8423 switch (*error_handler) {
8424 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008425 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008426 return -1;
Victor Stinner50149202015-09-22 00:26:54 +02008427
8428 case _Py_ERROR_REPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008429 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008430 x = charmapencode_output('?', mapping, res, respos);
8431 if (x==enc_EXCEPTION) {
8432 return -1;
8433 }
8434 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008435 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008436 return -1;
8437 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008438 }
8439 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02008440 case _Py_ERROR_IGNORE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008441 *inpos = collendpos;
8442 break;
Victor Stinner50149202015-09-22 00:26:54 +02008443
8444 case _Py_ERROR_XMLCHARREFREPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008445 /* generate replacement (temporarily (mis)uses p) */
8446 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008447 char buffer[2+29+1+1];
8448 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008449 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008450 for (cp = buffer; *cp; ++cp) {
8451 x = charmapencode_output(*cp, mapping, res, respos);
8452 if (x==enc_EXCEPTION)
8453 return -1;
8454 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008455 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008456 return -1;
8457 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008458 }
8459 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008460 *inpos = collendpos;
8461 break;
Victor Stinner50149202015-09-22 00:26:54 +02008462
Benjamin Peterson14339b62009-01-31 16:36:08 +00008463 default:
Victor Stinner50149202015-09-22 00:26:54 +02008464 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008465 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008466 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008467 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008468 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008469 if (PyBytes_Check(repunicode)) {
8470 /* Directly copy bytes result to output. */
8471 Py_ssize_t outsize = PyBytes_Size(*res);
8472 Py_ssize_t requiredsize;
8473 repsize = PyBytes_Size(repunicode);
8474 requiredsize = *respos + repsize;
8475 if (requiredsize > outsize)
8476 /* Make room for all additional bytes. */
8477 if (charmapencode_resize(res, respos, requiredsize)) {
8478 Py_DECREF(repunicode);
8479 return -1;
8480 }
8481 memcpy(PyBytes_AsString(*res) + *respos,
8482 PyBytes_AsString(repunicode), repsize);
8483 *respos += repsize;
8484 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008485 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008486 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008487 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008488 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008489 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008490 Py_DECREF(repunicode);
8491 return -1;
8492 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008493 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008494 data = PyUnicode_DATA(repunicode);
8495 kind = PyUnicode_KIND(repunicode);
8496 for (index = 0; index < repsize; index++) {
8497 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8498 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008499 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008500 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008501 return -1;
8502 }
8503 else if (x==enc_FAILED) {
8504 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008505 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008506 return -1;
8507 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008508 }
8509 *inpos = newpos;
8510 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008511 }
8512 return 0;
8513}
8514
Alexander Belopolsky40018472011-02-26 01:02:56 +00008515PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008516_PyUnicode_EncodeCharmap(PyObject *unicode,
8517 PyObject *mapping,
8518 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008519{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008520 /* output object */
8521 PyObject *res = NULL;
8522 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008523 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008524 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008525 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008526 Py_ssize_t respos = 0;
Victor Stinner50149202015-09-22 00:26:54 +02008527 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008528 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02008529 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008530 void *data;
8531 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008532
Benjamin Petersonbac79492012-01-14 13:34:47 -05008533 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008534 return NULL;
8535 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008536 data = PyUnicode_DATA(unicode);
8537 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008538
Guido van Rossumd57fd912000-03-10 22:53:23 +00008539 /* Default to Latin-1 */
8540 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008541 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008542
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008543 /* allocate enough for a simple encoding without
8544 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008545 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008546 if (res == NULL)
8547 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008548 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008549 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008550
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008551 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008552 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008553 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008554 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008555 if (x==enc_EXCEPTION) /* error */
8556 goto onError;
8557 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008558 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008559 &exc,
Victor Stinner50149202015-09-22 00:26:54 +02008560 &error_handler, &error_handler_obj, errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00008561 &res, &respos)) {
8562 goto onError;
8563 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008564 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008565 else
8566 /* done with this character => adjust input position */
8567 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008568 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008569
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008570 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008571 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008572 if (_PyBytes_Resize(&res, respos) < 0)
8573 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008574
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008575 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008576 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008577 return res;
8578
Benjamin Peterson29060642009-01-31 22:14:21 +00008579 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008580 Py_XDECREF(res);
8581 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008582 Py_XDECREF(error_handler_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008583 return NULL;
8584}
8585
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008586/* Deprecated */
8587PyObject *
8588PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8589 Py_ssize_t size,
8590 PyObject *mapping,
8591 const char *errors)
8592{
8593 PyObject *result;
8594 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8595 if (unicode == NULL)
8596 return NULL;
8597 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8598 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008599 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008600}
8601
Alexander Belopolsky40018472011-02-26 01:02:56 +00008602PyObject *
8603PyUnicode_AsCharmapString(PyObject *unicode,
8604 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008605{
8606 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008607 PyErr_BadArgument();
8608 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008609 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008610 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008611}
8612
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008613/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008614static void
8615make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008616 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008617 Py_ssize_t startpos, Py_ssize_t endpos,
8618 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008619{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008620 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008621 *exceptionObject = _PyUnicodeTranslateError_Create(
8622 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008623 }
8624 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008625 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8626 goto onError;
8627 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8628 goto onError;
8629 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8630 goto onError;
8631 return;
8632 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008633 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008634 }
8635}
8636
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008637/* error handling callback helper:
8638 build arguments, call the callback and check the arguments,
8639 put the result into newpos and return the replacement string, which
8640 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008641static PyObject *
8642unicode_translate_call_errorhandler(const char *errors,
8643 PyObject **errorHandler,
8644 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008645 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008646 Py_ssize_t startpos, Py_ssize_t endpos,
8647 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008648{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02008649 static const char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008650
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008651 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008652 PyObject *restuple;
8653 PyObject *resunicode;
8654
8655 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008656 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008657 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008658 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008659 }
8660
8661 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008662 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008663 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008664 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008665
8666 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008667 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008668 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008669 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008670 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008671 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008672 Py_DECREF(restuple);
8673 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008674 }
8675 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008676 &resunicode, &i_newpos)) {
8677 Py_DECREF(restuple);
8678 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008679 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008680 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008681 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008682 else
8683 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008684 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02008685 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008686 Py_DECREF(restuple);
8687 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008688 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008689 Py_INCREF(resunicode);
8690 Py_DECREF(restuple);
8691 return resunicode;
8692}
8693
8694/* Lookup the character ch in the mapping and put the result in result,
8695 which must be decrefed by the caller.
8696 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008697static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008698charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008699{
Christian Heimes217cfd12007-12-02 14:31:20 +00008700 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008701 PyObject *x;
8702
8703 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008704 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008705 x = PyObject_GetItem(mapping, w);
8706 Py_DECREF(w);
8707 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008708 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8709 /* No mapping found means: use 1:1 mapping. */
8710 PyErr_Clear();
8711 *result = NULL;
8712 return 0;
8713 } else
8714 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008715 }
8716 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008717 *result = x;
8718 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008719 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008720 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008721 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008722 if (value < 0 || value > MAX_UNICODE) {
8723 PyErr_Format(PyExc_ValueError,
8724 "character mapping must be in range(0x%x)",
8725 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008726 Py_DECREF(x);
8727 return -1;
8728 }
8729 *result = x;
8730 return 0;
8731 }
8732 else if (PyUnicode_Check(x)) {
8733 *result = x;
8734 return 0;
8735 }
8736 else {
8737 /* wrong return value */
8738 PyErr_SetString(PyExc_TypeError,
8739 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008740 Py_DECREF(x);
8741 return -1;
8742 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008743}
Victor Stinner1194ea02014-04-04 19:37:40 +02008744
8745/* lookup the character, write the result into the writer.
8746 Return 1 if the result was written into the writer, return 0 if the mapping
8747 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008748static int
Victor Stinner1194ea02014-04-04 19:37:40 +02008749charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8750 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008751{
Victor Stinner1194ea02014-04-04 19:37:40 +02008752 PyObject *item;
8753
8754 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00008755 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008756
8757 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008758 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02008759 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008760 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008761 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008762 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008763 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008764
8765 if (item == Py_None) {
8766 Py_DECREF(item);
8767 return 0;
8768 }
8769
8770 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02008771 long ch = (Py_UCS4)PyLong_AS_LONG(item);
8772 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8773 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008774 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8775 Py_DECREF(item);
8776 return -1;
8777 }
8778 Py_DECREF(item);
8779 return 1;
8780 }
8781
8782 if (!PyUnicode_Check(item)) {
8783 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008784 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008785 }
8786
8787 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8788 Py_DECREF(item);
8789 return -1;
8790 }
8791
8792 Py_DECREF(item);
8793 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008794}
8795
Victor Stinner89a76ab2014-04-05 11:44:04 +02008796static int
8797unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8798 Py_UCS1 *translate)
8799{
Benjamin Peterson1365de72014-04-07 20:15:41 -04008800 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008801 int ret = 0;
8802
Victor Stinner89a76ab2014-04-05 11:44:04 +02008803 if (charmaptranslate_lookup(ch, mapping, &item)) {
8804 return -1;
8805 }
8806
8807 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008808 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02008809 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008810 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008811 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008812 /* not found => default to 1:1 mapping */
8813 translate[ch] = ch;
8814 return 1;
8815 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008816 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02008817 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008818 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8819 used it */
8820 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008821 /* invalid character or character outside ASCII:
8822 skip the fast translate */
8823 goto exit;
8824 }
8825 translate[ch] = (Py_UCS1)replace;
8826 }
8827 else if (PyUnicode_Check(item)) {
8828 Py_UCS4 replace;
8829
8830 if (PyUnicode_READY(item) == -1) {
8831 Py_DECREF(item);
8832 return -1;
8833 }
8834 if (PyUnicode_GET_LENGTH(item) != 1)
8835 goto exit;
8836
8837 replace = PyUnicode_READ_CHAR(item, 0);
8838 if (replace > 127)
8839 goto exit;
8840 translate[ch] = (Py_UCS1)replace;
8841 }
8842 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008843 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02008844 goto exit;
8845 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008846 ret = 1;
8847
Benjamin Peterson1365de72014-04-07 20:15:41 -04008848 exit:
8849 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008850 return ret;
8851}
8852
8853/* Fast path for ascii => ascii translation. Return 1 if the whole string
8854 was translated into writer, return 0 if the input string was partially
8855 translated into writer, raise an exception and return -1 on error. */
8856static int
8857unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008858 _PyUnicodeWriter *writer, int ignore,
8859 Py_ssize_t *input_pos)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008860{
Victor Stinner872b2912014-04-05 14:27:07 +02008861 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008862 Py_ssize_t len;
8863 Py_UCS1 *in, *end, *out;
Victor Stinner872b2912014-04-05 14:27:07 +02008864 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008865
Victor Stinner89a76ab2014-04-05 11:44:04 +02008866 len = PyUnicode_GET_LENGTH(input);
8867
Victor Stinner872b2912014-04-05 14:27:07 +02008868 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008869
8870 in = PyUnicode_1BYTE_DATA(input);
8871 end = in + len;
8872
8873 assert(PyUnicode_IS_ASCII(writer->buffer));
8874 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
8875 out = PyUnicode_1BYTE_DATA(writer->buffer);
8876
Victor Stinner872b2912014-04-05 14:27:07 +02008877 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008878 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02008879 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008880 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02008881 int translate = unicode_fast_translate_lookup(mapping, ch,
8882 ascii_table);
8883 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008884 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02008885 if (translate == 0)
8886 goto exit;
8887 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008888 }
Victor Stinner872b2912014-04-05 14:27:07 +02008889 if (ch2 == 0xfe) {
8890 if (ignore)
8891 continue;
8892 goto exit;
8893 }
8894 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008895 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02008896 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008897 }
Victor Stinner872b2912014-04-05 14:27:07 +02008898 res = 1;
8899
8900exit:
8901 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008902 *input_pos = in - PyUnicode_1BYTE_DATA(input);
Victor Stinner872b2912014-04-05 14:27:07 +02008903 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008904}
8905
Victor Stinner3222da22015-10-01 22:07:32 +02008906static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008907_PyUnicode_TranslateCharmap(PyObject *input,
8908 PyObject *mapping,
8909 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008910{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008911 /* input object */
Victor Stinner1194ea02014-04-04 19:37:40 +02008912 char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008913 Py_ssize_t size, i;
8914 int kind;
8915 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02008916 _PyUnicodeWriter writer;
8917 /* error handler */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008918 char *reason = "character maps to <undefined>";
8919 PyObject *errorHandler = NULL;
8920 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008921 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008922 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008923
Guido van Rossumd57fd912000-03-10 22:53:23 +00008924 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008925 PyErr_BadArgument();
8926 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008927 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008928
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008929 if (PyUnicode_READY(input) == -1)
8930 return NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008931 data = (char*)PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008932 kind = PyUnicode_KIND(input);
8933 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008934
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03008935 if (size == 0)
8936 return PyUnicode_FromObject(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008937
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008938 /* allocate enough for a simple 1:1 translation without
8939 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02008940 _PyUnicodeWriter_Init(&writer);
8941 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008942 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008943
Victor Stinner872b2912014-04-05 14:27:07 +02008944 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
8945
Victor Stinner33798672016-03-01 21:59:58 +01008946 if (PyUnicode_READY(input) == -1)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008947 return NULL;
Victor Stinner33798672016-03-01 21:59:58 +01008948 if (PyUnicode_IS_ASCII(input)) {
8949 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
8950 if (res < 0) {
8951 _PyUnicodeWriter_Dealloc(&writer);
8952 return NULL;
8953 }
8954 if (res == 1)
8955 return _PyUnicodeWriter_Finish(&writer);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008956 }
Victor Stinner33798672016-03-01 21:59:58 +01008957 else {
8958 i = 0;
8959 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008960
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008961 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008962 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008963 int translate;
8964 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8965 Py_ssize_t newpos;
8966 /* startpos for collecting untranslatable chars */
8967 Py_ssize_t collstart;
8968 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02008969 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008970
Victor Stinner1194ea02014-04-04 19:37:40 +02008971 ch = PyUnicode_READ(kind, data, i);
8972 translate = charmaptranslate_output(ch, mapping, &writer);
8973 if (translate < 0)
8974 goto onError;
8975
8976 if (translate != 0) {
8977 /* it worked => adjust input pointer */
8978 ++i;
8979 continue;
8980 }
8981
8982 /* untranslatable character */
8983 collstart = i;
8984 collend = i+1;
8985
8986 /* find all untranslatable characters */
8987 while (collend < size) {
8988 PyObject *x;
8989 ch = PyUnicode_READ(kind, data, collend);
8990 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008991 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02008992 Py_XDECREF(x);
8993 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008994 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02008995 ++collend;
8996 }
8997
8998 if (ignore) {
8999 i = collend;
9000 }
9001 else {
9002 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9003 reason, input, &exc,
9004 collstart, collend, &newpos);
9005 if (repunicode == NULL)
9006 goto onError;
9007 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009008 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02009009 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009010 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009011 Py_DECREF(repunicode);
9012 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009013 }
9014 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009015 Py_XDECREF(exc);
9016 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02009017 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009018
Benjamin Peterson29060642009-01-31 22:14:21 +00009019 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02009020 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009021 Py_XDECREF(exc);
9022 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009023 return NULL;
9024}
9025
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009026/* Deprecated. Use PyUnicode_Translate instead. */
9027PyObject *
9028PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9029 Py_ssize_t size,
9030 PyObject *mapping,
9031 const char *errors)
9032{
Christian Heimes5f520f42012-09-11 14:03:25 +02009033 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009034 PyObject *unicode = PyUnicode_FromUnicode(p, size);
9035 if (!unicode)
9036 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02009037 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9038 Py_DECREF(unicode);
9039 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009040}
9041
Alexander Belopolsky40018472011-02-26 01:02:56 +00009042PyObject *
9043PyUnicode_Translate(PyObject *str,
9044 PyObject *mapping,
9045 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009046{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009047 if (ensure_unicode(str) < 0)
Christian Heimes5f520f42012-09-11 14:03:25 +02009048 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009049 return _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009050}
Tim Petersced69f82003-09-16 20:30:58 +00009051
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009052static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009053fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009054{
9055 /* No need to call PyUnicode_READY(self) because this function is only
9056 called as a callback from fixup() which does it already. */
9057 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9058 const int kind = PyUnicode_KIND(self);
9059 void *data = PyUnicode_DATA(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02009060 Py_UCS4 maxchar = 127, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009061 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009062 Py_ssize_t i;
9063
9064 for (i = 0; i < len; ++i) {
9065 ch = PyUnicode_READ(kind, data, i);
9066 fixed = 0;
9067 if (ch > 127) {
9068 if (Py_UNICODE_ISSPACE(ch))
9069 fixed = ' ';
9070 else {
9071 const int decimal = Py_UNICODE_TODECIMAL(ch);
9072 if (decimal >= 0)
9073 fixed = '0' + decimal;
9074 }
9075 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009076 modified = 1;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009077 maxchar = Py_MAX(maxchar, fixed);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009078 PyUnicode_WRITE(kind, data, i, fixed);
9079 }
Victor Stinnere6abb482012-05-02 01:15:40 +02009080 else
Benjamin Peterson7e303732013-06-10 09:19:46 -07009081 maxchar = Py_MAX(maxchar, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009082 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009083 }
9084
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009085 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009086}
9087
9088PyObject *
9089_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9090{
9091 if (!PyUnicode_Check(unicode)) {
9092 PyErr_BadInternalCall();
9093 return NULL;
9094 }
9095 if (PyUnicode_READY(unicode) == -1)
9096 return NULL;
9097 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
9098 /* If the string is already ASCII, just return the same string */
9099 Py_INCREF(unicode);
9100 return unicode;
9101 }
Victor Stinner9310abb2011-10-05 00:59:23 +02009102 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009103}
9104
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009105PyObject *
9106PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9107 Py_ssize_t length)
9108{
Victor Stinnerf0124502011-11-21 23:12:56 +01009109 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009110 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009111 Py_UCS4 maxchar;
9112 enum PyUnicode_Kind kind;
9113 void *data;
9114
Victor Stinner99d7ad02012-02-22 13:37:39 +01009115 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009116 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009117 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009118 if (ch > 127) {
9119 int decimal = Py_UNICODE_TODECIMAL(ch);
9120 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009121 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009122 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009123 }
9124 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009125
9126 /* Copy to a new string */
9127 decimal = PyUnicode_New(length, maxchar);
9128 if (decimal == NULL)
9129 return decimal;
9130 kind = PyUnicode_KIND(decimal);
9131 data = PyUnicode_DATA(decimal);
9132 /* Iterate over code points */
9133 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009134 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01009135 if (ch > 127) {
9136 int decimal = Py_UNICODE_TODECIMAL(ch);
9137 if (decimal >= 0)
9138 ch = '0' + decimal;
9139 }
9140 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009141 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009142 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009143}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009144/* --- Decimal Encoder ---------------------------------------------------- */
9145
Alexander Belopolsky40018472011-02-26 01:02:56 +00009146int
9147PyUnicode_EncodeDecimal(Py_UNICODE *s,
9148 Py_ssize_t length,
9149 char *output,
9150 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009151{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009152 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009153 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009154 enum PyUnicode_Kind kind;
9155 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009156
9157 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009158 PyErr_BadArgument();
9159 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009160 }
9161
Victor Stinner42bf7752011-11-21 22:52:58 +01009162 unicode = PyUnicode_FromUnicode(s, length);
9163 if (unicode == NULL)
9164 return -1;
9165
Benjamin Petersonbac79492012-01-14 13:34:47 -05009166 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner6345be92011-11-25 20:09:01 +01009167 Py_DECREF(unicode);
9168 return -1;
9169 }
Victor Stinner42bf7752011-11-21 22:52:58 +01009170 kind = PyUnicode_KIND(unicode);
9171 data = PyUnicode_DATA(unicode);
9172
Victor Stinnerb84d7232011-11-22 01:50:07 +01009173 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009174 PyObject *exc;
9175 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009176 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009177 Py_ssize_t startpos;
9178
9179 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009180
Benjamin Peterson29060642009-01-31 22:14:21 +00009181 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009182 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009183 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009184 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009185 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009186 decimal = Py_UNICODE_TODECIMAL(ch);
9187 if (decimal >= 0) {
9188 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009189 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009190 continue;
9191 }
9192 if (0 < ch && ch < 256) {
9193 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009194 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009195 continue;
9196 }
Victor Stinner6345be92011-11-25 20:09:01 +01009197
Victor Stinner42bf7752011-11-21 22:52:58 +01009198 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009199 exc = NULL;
9200 raise_encode_exception(&exc, "decimal", unicode,
9201 startpos, startpos+1,
9202 "invalid decimal Unicode string");
9203 Py_XDECREF(exc);
9204 Py_DECREF(unicode);
9205 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009206 }
9207 /* 0-terminate the output string */
9208 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009209 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009210 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009211}
9212
Guido van Rossumd57fd912000-03-10 22:53:23 +00009213/* --- Helpers ------------------------------------------------------------ */
9214
/* Helper macro to fix up start/end slice values against a length.

   `end` is capped at `len`; a negative `end` or `start` has `len` added to
   it and is then floored at 0.  `start` is not capped at `len`.

   `start` and `end` must be modifiable lvalues.  All three arguments are
   evaluated more than once, so they must be free of side effects.

   The body is wrapped in do { } while (0) so that the macro behaves as a
   single statement (e.g. under an unbraced `if`); always terminate the
   invocation with a semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len)) {                    \
            (end) = (len);                      \
        }                                       \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
9229
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009230static Py_ssize_t
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009231any_find_slice(PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009232 Py_ssize_t start,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009233 Py_ssize_t end,
9234 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009235{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009236 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009237 void *buf1, *buf2;
9238 Py_ssize_t len1, len2, result;
9239
9240 kind1 = PyUnicode_KIND(s1);
9241 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009242 if (kind1 < kind2)
9243 return -1;
9244
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009245 len1 = PyUnicode_GET_LENGTH(s1);
9246 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009247 ADJUST_INDICES(start, end, len1);
9248 if (end - start < len2)
9249 return -1;
9250
9251 buf1 = PyUnicode_DATA(s1);
9252 buf2 = PyUnicode_DATA(s2);
9253 if (len2 == 1) {
9254 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9255 result = findchar((const char *)buf1 + kind1*start,
9256 kind1, end - start, ch, direction);
9257 if (result == -1)
9258 return -1;
9259 else
9260 return start + result;
9261 }
9262
9263 if (kind2 != kind1) {
9264 buf2 = _PyUnicode_AsKind(s2, kind1);
9265 if (!buf2)
9266 return -2;
9267 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009268
Victor Stinner794d5672011-10-10 03:21:36 +02009269 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009270 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009271 case PyUnicode_1BYTE_KIND:
9272 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9273 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9274 else
9275 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9276 break;
9277 case PyUnicode_2BYTE_KIND:
9278 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9279 break;
9280 case PyUnicode_4BYTE_KIND:
9281 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9282 break;
9283 default:
9284 assert(0); result = -2;
9285 }
9286 }
9287 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009288 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009289 case PyUnicode_1BYTE_KIND:
9290 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9291 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9292 else
9293 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9294 break;
9295 case PyUnicode_2BYTE_KIND:
9296 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9297 break;
9298 case PyUnicode_4BYTE_KIND:
9299 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9300 break;
9301 default:
9302 assert(0); result = -2;
9303 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009304 }
9305
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009306 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009307 PyMem_Free(buf2);
9308
9309 return result;
9310}
9311
9312Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009313_PyUnicode_InsertThousandsGrouping(
9314 PyObject *unicode, Py_ssize_t index,
9315 Py_ssize_t n_buffer,
9316 void *digits, Py_ssize_t n_digits,
9317 Py_ssize_t min_width,
9318 const char *grouping, PyObject *thousands_sep,
9319 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009320{
Victor Stinner41a863c2012-02-24 00:37:51 +01009321 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009322 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01009323 Py_ssize_t thousands_sep_len;
9324 Py_ssize_t len;
9325
9326 if (unicode != NULL) {
9327 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009328 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01009329 }
9330 else {
9331 kind = PyUnicode_1BYTE_KIND;
9332 data = NULL;
9333 }
9334 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
9335 thousands_sep_data = PyUnicode_DATA(thousands_sep);
9336 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9337 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01009338 if (thousands_sep_kind < kind) {
9339 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
9340 if (!thousands_sep_data)
9341 return -1;
9342 }
9343 else {
9344 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
9345 if (!data)
9346 return -1;
9347 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009348 }
9349
Benjamin Petersonead6b532011-12-20 17:23:42 -06009350 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009351 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009352 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01009353 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009354 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009355 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009356 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009357 else
Victor Stinner41a863c2012-02-24 00:37:51 +01009358 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02009359 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009360 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009361 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009362 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009363 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009364 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009365 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009366 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009367 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009368 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009369 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009370 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009371 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009372 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009373 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009374 break;
9375 default:
9376 assert(0);
9377 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009378 }
Victor Stinner90f50d42012-02-24 01:44:47 +01009379 if (unicode != NULL && thousands_sep_kind != kind) {
9380 if (thousands_sep_kind < kind)
9381 PyMem_Free(thousands_sep_data);
9382 else
9383 PyMem_Free(data);
9384 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009385 if (unicode == NULL) {
9386 *maxchar = 127;
9387 if (len != n_digits) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009388 *maxchar = Py_MAX(*maxchar,
Victor Stinnere6abb482012-05-02 01:15:40 +02009389 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
Victor Stinner41a863c2012-02-24 00:37:51 +01009390 }
9391 }
9392 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009393}
9394
9395
Alexander Belopolsky40018472011-02-26 01:02:56 +00009396Py_ssize_t
9397PyUnicode_Count(PyObject *str,
9398 PyObject *substr,
9399 Py_ssize_t start,
9400 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009401{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009402 Py_ssize_t result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009403 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009404 void *buf1 = NULL, *buf2 = NULL;
9405 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009406
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009407 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009408 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009409
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009410 kind1 = PyUnicode_KIND(str);
9411 kind2 = PyUnicode_KIND(substr);
9412 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009413 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009414
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009415 len1 = PyUnicode_GET_LENGTH(str);
9416 len2 = PyUnicode_GET_LENGTH(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009417 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009418 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009419 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009420
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009421 buf1 = PyUnicode_DATA(str);
9422 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009423 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009424 buf2 = _PyUnicode_AsKind(substr, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009425 if (!buf2)
9426 goto onError;
9427 }
9428
9429 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009430 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009431 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
Victor Stinnerc3cec782011-10-05 21:24:08 +02009432 result = asciilib_count(
9433 ((Py_UCS1*)buf1) + start, end - start,
9434 buf2, len2, PY_SSIZE_T_MAX
9435 );
9436 else
9437 result = ucs1lib_count(
9438 ((Py_UCS1*)buf1) + start, end - start,
9439 buf2, len2, PY_SSIZE_T_MAX
9440 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009441 break;
9442 case PyUnicode_2BYTE_KIND:
9443 result = ucs2lib_count(
9444 ((Py_UCS2*)buf1) + start, end - start,
9445 buf2, len2, PY_SSIZE_T_MAX
9446 );
9447 break;
9448 case PyUnicode_4BYTE_KIND:
9449 result = ucs4lib_count(
9450 ((Py_UCS4*)buf1) + start, end - start,
9451 buf2, len2, PY_SSIZE_T_MAX
9452 );
9453 break;
9454 default:
9455 assert(0); result = 0;
9456 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009457
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009458 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009459 PyMem_Free(buf2);
9460
Guido van Rossumd57fd912000-03-10 22:53:23 +00009461 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009462 onError:
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009463 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009464 PyMem_Free(buf2);
9465 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009466}
9467
Alexander Belopolsky40018472011-02-26 01:02:56 +00009468Py_ssize_t
9469PyUnicode_Find(PyObject *str,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009470 PyObject *substr,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009471 Py_ssize_t start,
9472 Py_ssize_t end,
9473 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009474{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009475 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009476 return -2;
Tim Petersced69f82003-09-16 20:30:58 +00009477
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009478 return any_find_slice(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009479}
9480
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009481Py_ssize_t
9482PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9483 Py_ssize_t start, Py_ssize_t end,
9484 int direction)
9485{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009486 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009487 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009488 if (PyUnicode_READY(str) == -1)
9489 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009490 if (start < 0 || end < 0) {
9491 PyErr_SetString(PyExc_IndexError, "string index out of range");
9492 return -2;
9493 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009494 if (end > PyUnicode_GET_LENGTH(str))
9495 end = PyUnicode_GET_LENGTH(str);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009496 if (start >= end)
9497 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009498 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009499 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9500 kind, end-start, ch, direction);
9501 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009502 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009503 else
9504 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009505}
9506
Alexander Belopolsky40018472011-02-26 01:02:56 +00009507static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009508tailmatch(PyObject *self,
9509 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009510 Py_ssize_t start,
9511 Py_ssize_t end,
9512 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009513{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009514 int kind_self;
9515 int kind_sub;
9516 void *data_self;
9517 void *data_sub;
9518 Py_ssize_t offset;
9519 Py_ssize_t i;
9520 Py_ssize_t end_sub;
9521
9522 if (PyUnicode_READY(self) == -1 ||
9523 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009524 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009525
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009526 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9527 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009528 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009529 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009530
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009531 if (PyUnicode_GET_LENGTH(substring) == 0)
9532 return 1;
9533
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009534 kind_self = PyUnicode_KIND(self);
9535 data_self = PyUnicode_DATA(self);
9536 kind_sub = PyUnicode_KIND(substring);
9537 data_sub = PyUnicode_DATA(substring);
9538 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9539
9540 if (direction > 0)
9541 offset = end;
9542 else
9543 offset = start;
9544
9545 if (PyUnicode_READ(kind_self, data_self, offset) ==
9546 PyUnicode_READ(kind_sub, data_sub, 0) &&
9547 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9548 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9549 /* If both are of the same kind, memcmp is sufficient */
9550 if (kind_self == kind_sub) {
9551 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009552 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009553 data_sub,
9554 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009555 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009556 }
Martin Pantere26da7c2016-06-02 10:07:09 +00009557 /* otherwise we have to compare each character by first accessing it */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009558 else {
9559 /* We do not need to compare 0 and len(substring)-1 because
9560 the if statement above ensured already that they are equal
9561 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009562 for (i = 1; i < end_sub; ++i) {
9563 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9564 PyUnicode_READ(kind_sub, data_sub, i))
9565 return 0;
9566 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009567 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009568 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009569 }
9570
9571 return 0;
9572}
9573
Alexander Belopolsky40018472011-02-26 01:02:56 +00009574Py_ssize_t
9575PyUnicode_Tailmatch(PyObject *str,
9576 PyObject *substr,
9577 Py_ssize_t start,
9578 Py_ssize_t end,
9579 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009580{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009581 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009582 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009583
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009584 return tailmatch(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009585}
9586
Guido van Rossumd57fd912000-03-10 22:53:23 +00009587/* Apply fixfct filter to the Unicode object self and return a
9588 reference to the modified object */
9589
Alexander Belopolsky40018472011-02-26 01:02:56 +00009590static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009591fixup(PyObject *self,
9592 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009593{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009594 PyObject *u;
9595 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009596 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009597
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009598 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009599 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009600 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009601 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009602
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009603 /* fix functions return the new maximum character in a string,
9604 if the kind of the resulting unicode object does not change,
9605 everything is fine. Otherwise we need to change the string kind
9606 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009607 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009608
9609 if (maxchar_new == 0) {
9610 /* no changes */;
9611 if (PyUnicode_CheckExact(self)) {
9612 Py_DECREF(u);
9613 Py_INCREF(self);
9614 return self;
9615 }
9616 else
9617 return u;
9618 }
9619
Victor Stinnere6abb482012-05-02 01:15:40 +02009620 maxchar_new = align_maxchar(maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009621
Victor Stinnereaab6042011-12-11 22:22:39 +01009622 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009623 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009624
9625 /* In case the maximum character changed, we need to
9626 convert the string to the new category. */
9627 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9628 if (v == NULL) {
9629 Py_DECREF(u);
9630 return NULL;
9631 }
9632 if (maxchar_new > maxchar_old) {
9633 /* If the maxchar increased so that the kind changed, not all
9634 characters are representable anymore and we need to fix the
9635 string again. This only happens in very few cases. */
Victor Stinnerd3f08822012-05-29 12:57:52 +02009636 _PyUnicode_FastCopyCharacters(v, 0,
9637 self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinnereaab6042011-12-11 22:22:39 +01009638 maxchar_old = fixfct(v);
9639 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009640 }
9641 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009642 _PyUnicode_FastCopyCharacters(v, 0,
9643 u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009644 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009645 Py_DECREF(u);
9646 assert(_PyUnicode_CheckConsistency(v, 1));
9647 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009648}
9649
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009650static PyObject *
9651ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009652{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009653 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9654 char *resdata, *data = PyUnicode_DATA(self);
9655 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009656
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009657 res = PyUnicode_New(len, 127);
9658 if (res == NULL)
9659 return NULL;
9660 resdata = PyUnicode_DATA(res);
9661 if (lower)
9662 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009663 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009664 _Py_bytes_upper(resdata, data, len);
9665 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009666}
9667
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009668static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009669handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009670{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009671 Py_ssize_t j;
9672 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01009673 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009674 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009675
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009676 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9677
9678 where ! is a negation and \p{xxx} is a character with property xxx.
9679 */
9680 for (j = i - 1; j >= 0; j--) {
9681 c = PyUnicode_READ(kind, data, j);
9682 if (!_PyUnicode_IsCaseIgnorable(c))
9683 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009684 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009685 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9686 if (final_sigma) {
9687 for (j = i + 1; j < length; j++) {
9688 c = PyUnicode_READ(kind, data, j);
9689 if (!_PyUnicode_IsCaseIgnorable(c))
9690 break;
9691 }
9692 final_sigma = j == length || !_PyUnicode_IsCased(c);
9693 }
9694 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009695}
9696
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009697static int
9698lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9699 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009700{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009701 /* Obscure special case. */
9702 if (c == 0x3A3) {
9703 mapped[0] = handle_capital_sigma(kind, data, length, i);
9704 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009705 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009706 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009707}
9708
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009709static Py_ssize_t
9710do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009711{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009712 Py_ssize_t i, k = 0;
9713 int n_res, j;
9714 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009715
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009716 c = PyUnicode_READ(kind, data, 0);
9717 n_res = _PyUnicode_ToUpperFull(c, mapped);
9718 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009719 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009720 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009721 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009722 for (i = 1; i < length; i++) {
9723 c = PyUnicode_READ(kind, data, i);
9724 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9725 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009726 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009727 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009728 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009729 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009730 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009731}
9732
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009733static Py_ssize_t
9734do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9735 Py_ssize_t i, k = 0;
9736
9737 for (i = 0; i < length; i++) {
9738 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9739 int n_res, j;
9740 if (Py_UNICODE_ISUPPER(c)) {
9741 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9742 }
9743 else if (Py_UNICODE_ISLOWER(c)) {
9744 n_res = _PyUnicode_ToUpperFull(c, mapped);
9745 }
9746 else {
9747 n_res = 1;
9748 mapped[0] = c;
9749 }
9750 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009751 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009752 res[k++] = mapped[j];
9753 }
9754 }
9755 return k;
9756}
9757
9758static Py_ssize_t
9759do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9760 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009761{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009762 Py_ssize_t i, k = 0;
9763
9764 for (i = 0; i < length; i++) {
9765 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9766 int n_res, j;
9767 if (lower)
9768 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9769 else
9770 n_res = _PyUnicode_ToUpperFull(c, mapped);
9771 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009772 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009773 res[k++] = mapped[j];
9774 }
9775 }
9776 return k;
9777}
9778
9779static Py_ssize_t
9780do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9781{
9782 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9783}
9784
9785static Py_ssize_t
9786do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9787{
9788 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9789}
9790
Benjamin Petersone51757f2012-01-12 21:10:29 -05009791static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009792do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9793{
9794 Py_ssize_t i, k = 0;
9795
9796 for (i = 0; i < length; i++) {
9797 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9798 Py_UCS4 mapped[3];
9799 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9800 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009801 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009802 res[k++] = mapped[j];
9803 }
9804 }
9805 return k;
9806}
9807
9808static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009809do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9810{
9811 Py_ssize_t i, k = 0;
9812 int previous_is_cased;
9813
9814 previous_is_cased = 0;
9815 for (i = 0; i < length; i++) {
9816 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9817 Py_UCS4 mapped[3];
9818 int n_res, j;
9819
9820 if (previous_is_cased)
9821 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9822 else
9823 n_res = _PyUnicode_ToTitleFull(c, mapped);
9824
9825 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009826 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009827 res[k++] = mapped[j];
9828 }
9829
9830 previous_is_cased = _PyUnicode_IsCased(c);
9831 }
9832 return k;
9833}
9834
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009835static PyObject *
9836case_operation(PyObject *self,
9837 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9838{
9839 PyObject *res = NULL;
9840 Py_ssize_t length, newlength = 0;
9841 int kind, outkind;
9842 void *data, *outdata;
9843 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9844
Benjamin Petersoneea48462012-01-16 14:28:50 -05009845 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009846
9847 kind = PyUnicode_KIND(self);
9848 data = PyUnicode_DATA(self);
9849 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +02009850 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -04009851 PyErr_SetString(PyExc_OverflowError, "string is too long");
9852 return NULL;
9853 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -04009854 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009855 if (tmp == NULL)
9856 return PyErr_NoMemory();
9857 newlength = perform(kind, data, length, tmp, &maxchar);
9858 res = PyUnicode_New(newlength, maxchar);
9859 if (res == NULL)
9860 goto leave;
9861 tmpend = tmp + newlength;
9862 outdata = PyUnicode_DATA(res);
9863 outkind = PyUnicode_KIND(res);
9864 switch (outkind) {
9865 case PyUnicode_1BYTE_KIND:
9866 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9867 break;
9868 case PyUnicode_2BYTE_KIND:
9869 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9870 break;
9871 case PyUnicode_4BYTE_KIND:
9872 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9873 break;
9874 default:
9875 assert(0);
9876 break;
9877 }
9878 leave:
9879 PyMem_FREE(tmp);
9880 return res;
9881}
9882
Tim Peters8ce9f162004-08-27 01:49:32 +00009883PyObject *
9884PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009885{
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009886 PyObject *res;
9887 PyObject *fseq;
9888 Py_ssize_t seqlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009889 PyObject **items;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009890
Benjamin Peterson9743b2c2014-02-15 13:02:52 -05009891 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +00009892 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009893 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009894 }
9895
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009896 /* NOTE: the following code can't call back into Python code,
9897 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009898 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009899
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009900 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00009901 seqlen = PySequence_Fast_GET_SIZE(fseq);
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009902 res = _PyUnicode_JoinArray(separator, items, seqlen);
9903 Py_DECREF(fseq);
9904 return res;
9905}
9906
9907PyObject *
9908_PyUnicode_JoinArray(PyObject *separator, PyObject **items, Py_ssize_t seqlen)
9909{
9910 PyObject *res = NULL; /* the result */
9911 PyObject *sep = NULL;
9912 Py_ssize_t seplen;
9913 PyObject *item;
9914 Py_ssize_t sz, i, res_offset;
9915 Py_UCS4 maxchar;
9916 Py_UCS4 item_maxchar;
9917 int use_memcpy;
9918 unsigned char *res_data = NULL, *sep_data = NULL;
9919 PyObject *last_obj;
9920 unsigned int kind = 0;
9921
Tim Peters05eba1f2004-08-27 21:32:02 +00009922 /* If empty sequence, return u"". */
9923 if (seqlen == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02009924 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +00009925 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009926
Tim Peters05eba1f2004-08-27 21:32:02 +00009927 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009928 last_obj = NULL;
Victor Stinneracf47b82011-10-06 12:32:37 +02009929 if (seqlen == 1) {
9930 if (PyUnicode_CheckExact(items[0])) {
9931 res = items[0];
9932 Py_INCREF(res);
Victor Stinneracf47b82011-10-06 12:32:37 +02009933 return res;
9934 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009935 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009936 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009937 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009938 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009939 /* Set up sep and seplen */
9940 if (separator == NULL) {
9941 /* fall back to a blank space separator */
9942 sep = PyUnicode_FromOrdinal(' ');
9943 if (!sep)
9944 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009945 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009946 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009947 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009948 else {
9949 if (!PyUnicode_Check(separator)) {
9950 PyErr_Format(PyExc_TypeError,
9951 "separator: expected str instance,"
9952 " %.80s found",
9953 Py_TYPE(separator)->tp_name);
9954 goto onError;
9955 }
9956 if (PyUnicode_READY(separator))
9957 goto onError;
9958 sep = separator;
9959 seplen = PyUnicode_GET_LENGTH(separator);
9960 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9961 /* inc refcount to keep this code path symmetric with the
9962 above case of a blank separator */
9963 Py_INCREF(sep);
9964 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009965 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009966 }
9967
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009968 /* There are at least two things to join, or else we have a subclass
9969 * of str in the sequence.
9970 * Do a pre-pass to figure out the total amount of space we'll
9971 * need (sz), and see whether all argument are strings.
9972 */
9973 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009974#ifdef Py_DEBUG
9975 use_memcpy = 0;
9976#else
9977 use_memcpy = 1;
9978#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009979 for (i = 0; i < seqlen; i++) {
9980 const Py_ssize_t old_sz = sz;
9981 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009982 if (!PyUnicode_Check(item)) {
9983 PyErr_Format(PyExc_TypeError,
Victor Stinnera33bce02014-07-04 22:47:46 +02009984 "sequence item %zd: expected str instance,"
Benjamin Peterson29060642009-01-31 22:14:21 +00009985 " %.80s found",
9986 i, Py_TYPE(item)->tp_name);
9987 goto onError;
9988 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009989 if (PyUnicode_READY(item) == -1)
9990 goto onError;
9991 sz += PyUnicode_GET_LENGTH(item);
9992 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -07009993 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009994 if (i != 0)
9995 sz += seplen;
9996 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9997 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009998 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009999 goto onError;
10000 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010001 if (use_memcpy && last_obj != NULL) {
10002 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10003 use_memcpy = 0;
10004 }
10005 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010006 }
Tim Petersced69f82003-09-16 20:30:58 +000010007
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010008 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010009 if (res == NULL)
10010 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +000010011
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010012 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010013#ifdef Py_DEBUG
10014 use_memcpy = 0;
10015#else
10016 if (use_memcpy) {
10017 res_data = PyUnicode_1BYTE_DATA(res);
10018 kind = PyUnicode_KIND(res);
10019 if (seplen != 0)
10020 sep_data = PyUnicode_1BYTE_DATA(sep);
10021 }
10022#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +020010023 if (use_memcpy) {
10024 for (i = 0; i < seqlen; ++i) {
10025 Py_ssize_t itemlen;
10026 item = items[i];
10027
10028 /* Copy item, and maybe the separator. */
10029 if (i && seplen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010030 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010031 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010032 kind * seplen);
10033 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010034 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010035
10036 itemlen = PyUnicode_GET_LENGTH(item);
10037 if (itemlen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010038 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010039 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010040 kind * itemlen);
10041 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010042 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010043 }
10044 assert(res_data == PyUnicode_1BYTE_DATA(res)
10045 + kind * PyUnicode_GET_LENGTH(res));
10046 }
10047 else {
10048 for (i = 0, res_offset = 0; i < seqlen; ++i) {
10049 Py_ssize_t itemlen;
10050 item = items[i];
10051
10052 /* Copy item, and maybe the separator. */
10053 if (i && seplen != 0) {
10054 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10055 res_offset += seplen;
10056 }
10057
10058 itemlen = PyUnicode_GET_LENGTH(item);
10059 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020010060 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +020010061 res_offset += itemlen;
10062 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010063 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010064 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +020010065 }
Tim Peters8ce9f162004-08-27 01:49:32 +000010066
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010067 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010068 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010069 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010070
Benjamin Peterson29060642009-01-31 22:14:21 +000010071 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010072 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010073 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010074 return NULL;
10075}
10076
/* Store `value` into `length` consecutive characters of the buffer `data`,
   starting at character index `start`.  `kind` selects the character width
   (PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or PyUnicode_4BYTE_KIND);
   PyUnicode_WCHAR_KIND is not supported (asserted).

   This is a statement-like macro.  Its arguments may be evaluated more than
   once, so they must not have side effects. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
10100
Victor Stinnerd3f08822012-05-29 12:57:52 +020010101void
10102_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10103 Py_UCS4 fill_char)
10104{
10105 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
10106 const void *data = PyUnicode_DATA(unicode);
10107 assert(PyUnicode_IS_READY(unicode));
10108 assert(unicode_modifiable(unicode));
10109 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10110 assert(start >= 0);
10111 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
10112 FILL(kind, data, fill_char, start, length);
10113}
10114
Victor Stinner3fe55312012-01-04 00:33:50 +010010115Py_ssize_t
10116PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10117 Py_UCS4 fill_char)
10118{
10119 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +010010120
10121 if (!PyUnicode_Check(unicode)) {
10122 PyErr_BadInternalCall();
10123 return -1;
10124 }
10125 if (PyUnicode_READY(unicode) == -1)
10126 return -1;
10127 if (unicode_check_modifiable(unicode))
10128 return -1;
10129
Victor Stinnerd3f08822012-05-29 12:57:52 +020010130 if (start < 0) {
10131 PyErr_SetString(PyExc_IndexError, "string index out of range");
10132 return -1;
10133 }
Victor Stinner3fe55312012-01-04 00:33:50 +010010134 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10135 PyErr_SetString(PyExc_ValueError,
10136 "fill character is bigger than "
10137 "the string maximum character");
10138 return -1;
10139 }
10140
10141 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10142 length = Py_MIN(maxlen, length);
10143 if (length <= 0)
10144 return 0;
10145
Victor Stinnerd3f08822012-05-29 12:57:52 +020010146 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +010010147 return length;
10148}
10149
Victor Stinner9310abb2011-10-05 00:59:23 +020010150static PyObject *
10151pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010152 Py_ssize_t left,
10153 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010154 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010155{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010156 PyObject *u;
10157 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010158 int kind;
10159 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010160
10161 if (left < 0)
10162 left = 0;
10163 if (right < 0)
10164 right = 0;
10165
Victor Stinnerc4b49542011-12-11 22:44:26 +010010166 if (left == 0 && right == 0)
10167 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010168
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010169 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10170 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010171 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10172 return NULL;
10173 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010174 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010175 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010176 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010177 if (!u)
10178 return NULL;
10179
10180 kind = PyUnicode_KIND(u);
10181 data = PyUnicode_DATA(u);
10182 if (left)
10183 FILL(kind, data, fill, 0, left);
10184 if (right)
10185 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010186 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010187 assert(_PyUnicode_CheckConsistency(u, 1));
10188 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010189}
10190
Alexander Belopolsky40018472011-02-26 01:02:56 +000010191PyObject *
10192PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010193{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010194 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010195
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010196 if (ensure_unicode(string) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010197 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010198
Benjamin Petersonead6b532011-12-20 17:23:42 -060010199 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010200 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010201 if (PyUnicode_IS_ASCII(string))
10202 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010203 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010204 PyUnicode_GET_LENGTH(string), keepends);
10205 else
10206 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010207 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010208 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010209 break;
10210 case PyUnicode_2BYTE_KIND:
10211 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010212 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010213 PyUnicode_GET_LENGTH(string), keepends);
10214 break;
10215 case PyUnicode_4BYTE_KIND:
10216 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010217 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010218 PyUnicode_GET_LENGTH(string), keepends);
10219 break;
10220 default:
10221 assert(0);
10222 list = 0;
10223 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010224 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010225}
10226
Alexander Belopolsky40018472011-02-26 01:02:56 +000010227static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010228split(PyObject *self,
10229 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010230 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010231{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010232 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010233 void *buf1, *buf2;
10234 Py_ssize_t len1, len2;
10235 PyObject* out;
10236
Guido van Rossumd57fd912000-03-10 22:53:23 +000010237 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010238 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010239
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010240 if (PyUnicode_READY(self) == -1)
10241 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010242
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010243 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010244 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010245 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010246 if (PyUnicode_IS_ASCII(self))
10247 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010248 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010249 PyUnicode_GET_LENGTH(self), maxcount
10250 );
10251 else
10252 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010253 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010254 PyUnicode_GET_LENGTH(self), maxcount
10255 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010256 case PyUnicode_2BYTE_KIND:
10257 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010258 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010259 PyUnicode_GET_LENGTH(self), maxcount
10260 );
10261 case PyUnicode_4BYTE_KIND:
10262 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010263 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010264 PyUnicode_GET_LENGTH(self), maxcount
10265 );
10266 default:
10267 assert(0);
10268 return NULL;
10269 }
10270
10271 if (PyUnicode_READY(substring) == -1)
10272 return NULL;
10273
10274 kind1 = PyUnicode_KIND(self);
10275 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010276 len1 = PyUnicode_GET_LENGTH(self);
10277 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010278 if (kind1 < kind2 || len1 < len2) {
10279 out = PyList_New(1);
10280 if (out == NULL)
10281 return NULL;
10282 Py_INCREF(self);
10283 PyList_SET_ITEM(out, 0, self);
10284 return out;
10285 }
10286 buf1 = PyUnicode_DATA(self);
10287 buf2 = PyUnicode_DATA(substring);
10288 if (kind2 != kind1) {
10289 buf2 = _PyUnicode_AsKind(substring, kind1);
10290 if (!buf2)
10291 return NULL;
10292 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010293
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010294 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010295 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010296 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10297 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010298 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010299 else
10300 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010301 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010302 break;
10303 case PyUnicode_2BYTE_KIND:
10304 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010305 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010306 break;
10307 case PyUnicode_4BYTE_KIND:
10308 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010309 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010310 break;
10311 default:
10312 out = NULL;
10313 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010314 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010315 PyMem_Free(buf2);
10316 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010317}
10318
Alexander Belopolsky40018472011-02-26 01:02:56 +000010319static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010320rsplit(PyObject *self,
10321 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010322 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010323{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010324 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010325 void *buf1, *buf2;
10326 Py_ssize_t len1, len2;
10327 PyObject* out;
10328
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010329 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010330 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010331
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010332 if (PyUnicode_READY(self) == -1)
10333 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010334
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010335 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010336 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010337 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010338 if (PyUnicode_IS_ASCII(self))
10339 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010340 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010341 PyUnicode_GET_LENGTH(self), maxcount
10342 );
10343 else
10344 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010345 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010346 PyUnicode_GET_LENGTH(self), maxcount
10347 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010348 case PyUnicode_2BYTE_KIND:
10349 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010350 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010351 PyUnicode_GET_LENGTH(self), maxcount
10352 );
10353 case PyUnicode_4BYTE_KIND:
10354 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010355 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010356 PyUnicode_GET_LENGTH(self), maxcount
10357 );
10358 default:
10359 assert(0);
10360 return NULL;
10361 }
10362
10363 if (PyUnicode_READY(substring) == -1)
10364 return NULL;
10365
10366 kind1 = PyUnicode_KIND(self);
10367 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010368 len1 = PyUnicode_GET_LENGTH(self);
10369 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010370 if (kind1 < kind2 || len1 < len2) {
10371 out = PyList_New(1);
10372 if (out == NULL)
10373 return NULL;
10374 Py_INCREF(self);
10375 PyList_SET_ITEM(out, 0, self);
10376 return out;
10377 }
10378 buf1 = PyUnicode_DATA(self);
10379 buf2 = PyUnicode_DATA(substring);
10380 if (kind2 != kind1) {
10381 buf2 = _PyUnicode_AsKind(substring, kind1);
10382 if (!buf2)
10383 return NULL;
10384 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010385
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010386 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010387 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010388 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10389 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010390 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010391 else
10392 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010393 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010394 break;
10395 case PyUnicode_2BYTE_KIND:
10396 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010397 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010398 break;
10399 case PyUnicode_4BYTE_KIND:
10400 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010401 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010402 break;
10403 default:
10404 out = NULL;
10405 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010406 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010407 PyMem_Free(buf2);
10408 return out;
10409}
10410
10411static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010412anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10413 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010414{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010415 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010416 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010417 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10418 return asciilib_find(buf1, len1, buf2, len2, offset);
10419 else
10420 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010421 case PyUnicode_2BYTE_KIND:
10422 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10423 case PyUnicode_4BYTE_KIND:
10424 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10425 }
10426 assert(0);
10427 return -1;
10428}
10429
10430static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010431anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10432 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010433{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010434 switch (kind) {
10435 case PyUnicode_1BYTE_KIND:
10436 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10437 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10438 else
10439 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10440 case PyUnicode_2BYTE_KIND:
10441 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10442 case PyUnicode_4BYTE_KIND:
10443 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10444 }
10445 assert(0);
10446 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010447}
10448
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010449static void
10450replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10451 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10452{
10453 int kind = PyUnicode_KIND(u);
10454 void *data = PyUnicode_DATA(u);
10455 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10456 if (kind == PyUnicode_1BYTE_KIND) {
10457 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10458 (Py_UCS1 *)data + len,
10459 u1, u2, maxcount);
10460 }
10461 else if (kind == PyUnicode_2BYTE_KIND) {
10462 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10463 (Py_UCS2 *)data + len,
10464 u1, u2, maxcount);
10465 }
10466 else {
10467 assert(kind == PyUnicode_4BYTE_KIND);
10468 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10469 (Py_UCS4 *)data + len,
10470 u1, u2, maxcount);
10471 }
10472}
10473
Alexander Belopolsky40018472011-02-26 01:02:56 +000010474static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010475replace(PyObject *self, PyObject *str1,
10476 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010477{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010478 PyObject *u;
10479 char *sbuf = PyUnicode_DATA(self);
10480 char *buf1 = PyUnicode_DATA(str1);
10481 char *buf2 = PyUnicode_DATA(str2);
10482 int srelease = 0, release1 = 0, release2 = 0;
10483 int skind = PyUnicode_KIND(self);
10484 int kind1 = PyUnicode_KIND(str1);
10485 int kind2 = PyUnicode_KIND(str2);
10486 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10487 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10488 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010489 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010490 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010491
10492 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010493 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010494 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010495 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010496
Victor Stinner59de0ee2011-10-07 10:01:28 +020010497 if (str1 == str2)
10498 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010499
Victor Stinner49a0a212011-10-12 23:46:10 +020010500 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010501 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10502 if (maxchar < maxchar_str1)
10503 /* substring too wide to be present */
10504 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010505 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10506 /* Replacing str1 with str2 may cause a maxchar reduction in the
10507 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010508 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010509 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010510
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010511 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010512 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010513 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010514 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010515 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010516 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010517 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010518 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010519
Victor Stinner69ed0f42013-04-09 21:48:24 +020010520 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010521 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010522 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010523 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010524 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010525 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010526 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010527 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010528
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010529 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10530 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010531 }
10532 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010533 int rkind = skind;
10534 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010535 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010536
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010537 if (kind1 < rkind) {
10538 /* widen substring */
10539 buf1 = _PyUnicode_AsKind(str1, rkind);
10540 if (!buf1) goto error;
10541 release1 = 1;
10542 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010543 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010544 if (i < 0)
10545 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010546 if (rkind > kind2) {
10547 /* widen replacement */
10548 buf2 = _PyUnicode_AsKind(str2, rkind);
10549 if (!buf2) goto error;
10550 release2 = 1;
10551 }
10552 else if (rkind < kind2) {
10553 /* widen self and buf1 */
10554 rkind = kind2;
10555 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010556 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010557 sbuf = _PyUnicode_AsKind(self, rkind);
10558 if (!sbuf) goto error;
10559 srelease = 1;
10560 buf1 = _PyUnicode_AsKind(str1, rkind);
10561 if (!buf1) goto error;
10562 release1 = 1;
10563 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010564 u = PyUnicode_New(slen, maxchar);
10565 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010566 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010567 assert(PyUnicode_KIND(u) == rkind);
10568 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010569
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010570 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010571 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010572 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010573 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010574 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010575 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010576
10577 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010578 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010579 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010580 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010581 if (i == -1)
10582 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010583 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010584 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010585 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010586 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010587 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010588 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010589 }
10590 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010591 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010592 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010593 int rkind = skind;
10594 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010595
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010596 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010597 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010598 buf1 = _PyUnicode_AsKind(str1, rkind);
10599 if (!buf1) goto error;
10600 release1 = 1;
10601 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010602 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010603 if (n == 0)
10604 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010605 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010606 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010607 buf2 = _PyUnicode_AsKind(str2, rkind);
10608 if (!buf2) goto error;
10609 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010610 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010611 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010612 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010613 rkind = kind2;
10614 sbuf = _PyUnicode_AsKind(self, rkind);
10615 if (!sbuf) goto error;
10616 srelease = 1;
10617 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010618 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010619 buf1 = _PyUnicode_AsKind(str1, rkind);
10620 if (!buf1) goto error;
10621 release1 = 1;
10622 }
10623 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10624 PyUnicode_GET_LENGTH(str1))); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010625 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010626 PyErr_SetString(PyExc_OverflowError,
10627 "replace string is too long");
10628 goto error;
10629 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010630 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010631 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010632 _Py_INCREF_UNICODE_EMPTY();
10633 if (!unicode_empty)
10634 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010635 u = unicode_empty;
10636 goto done;
10637 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010638 if (new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010639 PyErr_SetString(PyExc_OverflowError,
10640 "replace string is too long");
10641 goto error;
10642 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010643 u = PyUnicode_New(new_size, maxchar);
10644 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010645 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010646 assert(PyUnicode_KIND(u) == rkind);
10647 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010648 ires = i = 0;
10649 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010650 while (n-- > 0) {
10651 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010652 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010653 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010654 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010655 if (j == -1)
10656 break;
10657 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010658 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010659 memcpy(res + rkind * ires,
10660 sbuf + rkind * i,
10661 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010662 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010663 }
10664 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010665 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010666 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010667 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010668 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010669 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010670 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010671 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010672 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010673 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010674 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010675 memcpy(res + rkind * ires,
10676 sbuf + rkind * i,
10677 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010678 }
10679 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010680 /* interleave */
10681 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010682 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010683 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010684 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010685 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010686 if (--n <= 0)
10687 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010688 memcpy(res + rkind * ires,
10689 sbuf + rkind * i,
10690 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010691 ires++;
10692 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010693 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010694 memcpy(res + rkind * ires,
10695 sbuf + rkind * i,
10696 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010697 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010698 }
10699
10700 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010701 unicode_adjust_maxchar(&u);
10702 if (u == NULL)
10703 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010704 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010705
10706 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010707 if (srelease)
10708 PyMem_FREE(sbuf);
10709 if (release1)
10710 PyMem_FREE(buf1);
10711 if (release2)
10712 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010713 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010714 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010715
Benjamin Peterson29060642009-01-31 22:14:21 +000010716 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010717 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010718 if (srelease)
10719 PyMem_FREE(sbuf);
10720 if (release1)
10721 PyMem_FREE(buf1);
10722 if (release2)
10723 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010724 return unicode_result_unchanged(self);
10725
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010726 error:
10727 if (srelease && sbuf)
10728 PyMem_FREE(sbuf);
10729 if (release1 && buf1)
10730 PyMem_FREE(buf1);
10731 if (release2 && buf2)
10732 PyMem_FREE(buf2);
10733 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010734}
10735
10736/* --- Unicode Object Methods --------------------------------------------- */
10737
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010738PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010739 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010740\n\
10741Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010742characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010743
10744static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010745unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010746{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010747 if (PyUnicode_READY(self) == -1)
10748 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010749 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010750}
10751
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010752PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010753 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010754\n\
10755Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010756have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010757
10758static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010759unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010760{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010761 if (PyUnicode_READY(self) == -1)
10762 return NULL;
10763 if (PyUnicode_GET_LENGTH(self) == 0)
10764 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010765 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010766}
10767
Benjamin Petersond5890c82012-01-14 13:23:30 -050010768PyDoc_STRVAR(casefold__doc__,
10769 "S.casefold() -> str\n\
10770\n\
10771Return a version of S suitable for caseless comparisons.");
10772
10773static PyObject *
10774unicode_casefold(PyObject *self)
10775{
10776 if (PyUnicode_READY(self) == -1)
10777 return NULL;
10778 if (PyUnicode_IS_ASCII(self))
10779 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010780 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010781}
10782
10783
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010784/* Argument converter. Accepts a single Unicode character. */
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010785
10786static int
10787convert_uc(PyObject *obj, void *addr)
10788{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010789 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010790
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010791 if (!PyUnicode_Check(obj)) {
10792 PyErr_Format(PyExc_TypeError,
10793 "The fill character must be a unicode character, "
10794 "not %.100s", Py_TYPE(obj)->tp_name);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010795 return 0;
10796 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010797 if (PyUnicode_READY(obj) < 0)
10798 return 0;
10799 if (PyUnicode_GET_LENGTH(obj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010800 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010801 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010802 return 0;
10803 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010804 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010805 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010806}
10807
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010808PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010809 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010810\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010811Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010812done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010813
10814static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010815unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010816{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010817 Py_ssize_t marg, left;
10818 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010819 Py_UCS4 fillchar = ' ';
10820
Victor Stinnere9a29352011-10-01 02:14:59 +020010821 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010822 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010823
Benjamin Petersonbac79492012-01-14 13:34:47 -050010824 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010825 return NULL;
10826
Victor Stinnerc4b49542011-12-11 22:44:26 +010010827 if (PyUnicode_GET_LENGTH(self) >= width)
10828 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010829
Victor Stinnerc4b49542011-12-11 22:44:26 +010010830 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010831 left = marg / 2 + (marg & width & 1);
10832
Victor Stinner9310abb2011-10-05 00:59:23 +020010833 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010834}
10835
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010836/* This function assumes that str1 and str2 are readied by the caller. */
10837
Marc-André Lemburge5034372000-08-08 08:04:29 +000010838static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010839unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010840{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010841#define COMPARE(TYPE1, TYPE2) \
10842 do { \
10843 TYPE1* p1 = (TYPE1 *)data1; \
10844 TYPE2* p2 = (TYPE2 *)data2; \
10845 TYPE1* end = p1 + len; \
10846 Py_UCS4 c1, c2; \
10847 for (; p1 != end; p1++, p2++) { \
10848 c1 = *p1; \
10849 c2 = *p2; \
10850 if (c1 != c2) \
10851 return (c1 < c2) ? -1 : 1; \
10852 } \
10853 } \
10854 while (0)
10855
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010856 int kind1, kind2;
10857 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010858 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010859
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010860 kind1 = PyUnicode_KIND(str1);
10861 kind2 = PyUnicode_KIND(str2);
10862 data1 = PyUnicode_DATA(str1);
10863 data2 = PyUnicode_DATA(str2);
10864 len1 = PyUnicode_GET_LENGTH(str1);
10865 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010866 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010867
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010868 switch(kind1) {
10869 case PyUnicode_1BYTE_KIND:
10870 {
10871 switch(kind2) {
10872 case PyUnicode_1BYTE_KIND:
10873 {
10874 int cmp = memcmp(data1, data2, len);
10875 /* normalize result of memcmp() into the range [-1; 1] */
10876 if (cmp < 0)
10877 return -1;
10878 if (cmp > 0)
10879 return 1;
10880 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020010881 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010882 case PyUnicode_2BYTE_KIND:
10883 COMPARE(Py_UCS1, Py_UCS2);
10884 break;
10885 case PyUnicode_4BYTE_KIND:
10886 COMPARE(Py_UCS1, Py_UCS4);
10887 break;
10888 default:
10889 assert(0);
10890 }
10891 break;
10892 }
10893 case PyUnicode_2BYTE_KIND:
10894 {
10895 switch(kind2) {
10896 case PyUnicode_1BYTE_KIND:
10897 COMPARE(Py_UCS2, Py_UCS1);
10898 break;
10899 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010900 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010901 COMPARE(Py_UCS2, Py_UCS2);
10902 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010903 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010904 case PyUnicode_4BYTE_KIND:
10905 COMPARE(Py_UCS2, Py_UCS4);
10906 break;
10907 default:
10908 assert(0);
10909 }
10910 break;
10911 }
10912 case PyUnicode_4BYTE_KIND:
10913 {
10914 switch(kind2) {
10915 case PyUnicode_1BYTE_KIND:
10916 COMPARE(Py_UCS4, Py_UCS1);
10917 break;
10918 case PyUnicode_2BYTE_KIND:
10919 COMPARE(Py_UCS4, Py_UCS2);
10920 break;
10921 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010922 {
10923#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10924 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10925 /* normalize result of wmemcmp() into the range [-1; 1] */
10926 if (cmp < 0)
10927 return -1;
10928 if (cmp > 0)
10929 return 1;
10930#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010931 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020010932#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010933 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010934 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010935 default:
10936 assert(0);
10937 }
10938 break;
10939 }
10940 default:
10941 assert(0);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010942 }
10943
Victor Stinner770e19e2012-10-04 22:59:45 +020010944 if (len1 == len2)
10945 return 0;
10946 if (len1 < len2)
10947 return -1;
10948 else
10949 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010950
10951#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000010952}
10953
Benjamin Peterson621b4302016-09-09 13:54:34 -070010954static int
Victor Stinnere5567ad2012-10-23 02:48:49 +020010955unicode_compare_eq(PyObject *str1, PyObject *str2)
10956{
10957 int kind;
10958 void *data1, *data2;
10959 Py_ssize_t len;
10960 int cmp;
10961
Victor Stinnere5567ad2012-10-23 02:48:49 +020010962 len = PyUnicode_GET_LENGTH(str1);
10963 if (PyUnicode_GET_LENGTH(str2) != len)
10964 return 0;
10965 kind = PyUnicode_KIND(str1);
10966 if (PyUnicode_KIND(str2) != kind)
10967 return 0;
10968 data1 = PyUnicode_DATA(str1);
10969 data2 = PyUnicode_DATA(str2);
10970
10971 cmp = memcmp(data1, data2, len * kind);
10972 return (cmp == 0);
10973}
10974
10975
Alexander Belopolsky40018472011-02-26 01:02:56 +000010976int
10977PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010978{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010979 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10980 if (PyUnicode_READY(left) == -1 ||
10981 PyUnicode_READY(right) == -1)
10982 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010010983
10984 /* a string is equal to itself */
10985 if (left == right)
10986 return 0;
10987
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010988 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010989 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010990 PyErr_Format(PyExc_TypeError,
10991 "Can't compare %.100s and %.100s",
10992 left->ob_type->tp_name,
10993 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010994 return -1;
10995}
10996
Martin v. Löwis5b222132007-06-10 09:51:05 +000010997int
Victor Stinnerad14ccd2013-11-07 00:46:04 +010010998_PyUnicode_CompareWithId(PyObject *left, _Py_Identifier *right)
10999{
11000 PyObject *right_str = _PyUnicode_FromId(right); /* borrowed */
11001 if (right_str == NULL)
11002 return -1;
11003 return PyUnicode_Compare(left, right_str);
11004}
11005
11006int
Martin v. Löwis5b222132007-06-10 09:51:05 +000011007PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11008{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011009 Py_ssize_t i;
11010 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011011 Py_UCS4 chr;
11012
Victor Stinner910337b2011-10-03 03:20:16 +020011013 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011014 if (PyUnicode_READY(uni) == -1)
11015 return -1;
11016 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011017 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010011018 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010011019 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011020 size_t len, len2 = strlen(str);
11021 int cmp;
11022
11023 len = Py_MIN(len1, len2);
11024 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010011025 if (cmp != 0) {
11026 if (cmp < 0)
11027 return -1;
11028 else
11029 return 1;
11030 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010011031 if (len1 > len2)
11032 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011033 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010011034 return -1; /* str is longer */
11035 return 0;
11036 }
11037 else {
11038 void *data = PyUnicode_DATA(uni);
11039 /* Compare Unicode string and source character set string */
11040 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020011041 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010011042 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11043 /* This check keeps Python strings that end in '\0' from comparing equal
11044 to C strings identical up to that point. */
11045 if (PyUnicode_GET_LENGTH(uni) != i || chr)
11046 return 1; /* uni is longer */
11047 if (str[i])
11048 return -1; /* str is longer */
11049 return 0;
11050 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000011051}
11052
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011053
Benjamin Peterson29060642009-01-31 22:14:21 +000011054#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000011055 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011056
Alexander Belopolsky40018472011-02-26 01:02:56 +000011057PyObject *
11058PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011059{
11060 int result;
Victor Stinnere5567ad2012-10-23 02:48:49 +020011061 PyObject *v;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011062
Victor Stinnere5567ad2012-10-23 02:48:49 +020011063 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11064 Py_RETURN_NOTIMPLEMENTED;
11065
11066 if (PyUnicode_READY(left) == -1 ||
11067 PyUnicode_READY(right) == -1)
11068 return NULL;
11069
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011070 if (left == right) {
11071 switch (op) {
11072 case Py_EQ:
11073 case Py_LE:
11074 case Py_GE:
11075 /* a string is equal to itself */
11076 v = Py_True;
11077 break;
11078 case Py_NE:
11079 case Py_LT:
11080 case Py_GT:
11081 v = Py_False;
11082 break;
11083 default:
11084 PyErr_BadArgument();
11085 return NULL;
11086 }
11087 }
11088 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020011089 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010011090 result ^= (op == Py_NE);
11091 v = TEST_COND(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020011092 }
11093 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020011094 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011095
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011096 /* Convert the return value to a Boolean */
11097 switch (op) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011098 case Py_LE:
11099 v = TEST_COND(result <= 0);
11100 break;
11101 case Py_GE:
11102 v = TEST_COND(result >= 0);
11103 break;
11104 case Py_LT:
11105 v = TEST_COND(result == -1);
11106 break;
11107 case Py_GT:
11108 v = TEST_COND(result == 1);
11109 break;
11110 default:
11111 PyErr_BadArgument();
11112 return NULL;
11113 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011114 }
Victor Stinnere5567ad2012-10-23 02:48:49 +020011115 Py_INCREF(v);
11116 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011117}
11118
Alexander Belopolsky40018472011-02-26 01:02:56 +000011119int
Raymond Hettingerac2ef652015-07-04 16:04:44 -070011120_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11121{
11122 return unicode_eq(aa, bb);
11123}
11124
11125int
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011126PyUnicode_Contains(PyObject *str, PyObject *substr)
Guido van Rossum403d68b2000-03-13 15:55:09 +000011127{
Victor Stinner77282cb2013-04-14 19:22:47 +020011128 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011129 void *buf1, *buf2;
11130 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011131 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011132
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011133 if (!PyUnicode_Check(substr)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011134 PyErr_Format(PyExc_TypeError,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011135 "'in <string>' requires string as left operand, not %.100s",
11136 Py_TYPE(substr)->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011137 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011138 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011139 if (PyUnicode_READY(substr) == -1)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011140 return -1;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011141 if (ensure_unicode(str) < 0)
11142 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011143
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011144 kind1 = PyUnicode_KIND(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011145 kind2 = PyUnicode_KIND(substr);
11146 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011147 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011148 len1 = PyUnicode_GET_LENGTH(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011149 len2 = PyUnicode_GET_LENGTH(substr);
11150 if (len1 < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011151 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011152 buf1 = PyUnicode_DATA(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011153 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011154 if (len2 == 1) {
11155 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11156 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011157 return result;
11158 }
11159 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011160 buf2 = _PyUnicode_AsKind(substr, kind1);
11161 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011162 return -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011163 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011164
Victor Stinner77282cb2013-04-14 19:22:47 +020011165 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011166 case PyUnicode_1BYTE_KIND:
11167 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11168 break;
11169 case PyUnicode_2BYTE_KIND:
11170 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11171 break;
11172 case PyUnicode_4BYTE_KIND:
11173 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11174 break;
11175 default:
11176 result = -1;
11177 assert(0);
11178 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011179
Victor Stinner77282cb2013-04-14 19:22:47 +020011180 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011181 PyMem_Free(buf2);
11182
Guido van Rossum403d68b2000-03-13 15:55:09 +000011183 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011184}
11185
Guido van Rossumd57fd912000-03-10 22:53:23 +000011186/* Concat to string or Unicode object giving a new Unicode object. */
11187
Alexander Belopolsky40018472011-02-26 01:02:56 +000011188PyObject *
11189PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011190{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011191 PyObject *result;
Victor Stinner127226b2011-10-13 01:12:34 +020011192 Py_UCS4 maxchar, maxchar2;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011193 Py_ssize_t left_len, right_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011194
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011195 if (ensure_unicode(left) < 0 || ensure_unicode(right) < 0)
11196 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011197
11198 /* Shortcuts */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011199 if (left == unicode_empty)
11200 return PyUnicode_FromObject(right);
11201 if (right == unicode_empty)
11202 return PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011203
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011204 left_len = PyUnicode_GET_LENGTH(left);
11205 right_len = PyUnicode_GET_LENGTH(right);
11206 if (left_len > PY_SSIZE_T_MAX - right_len) {
Victor Stinner488fa492011-12-12 00:01:39 +010011207 PyErr_SetString(PyExc_OverflowError,
11208 "strings are too large to concat");
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011209 return NULL;
Victor Stinner488fa492011-12-12 00:01:39 +010011210 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011211 new_len = left_len + right_len;
Victor Stinner488fa492011-12-12 00:01:39 +010011212
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011213 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11214 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011215 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011216
Guido van Rossumd57fd912000-03-10 22:53:23 +000011217 /* Concat the two Unicode strings */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011218 result = PyUnicode_New(new_len, maxchar);
11219 if (result == NULL)
11220 return NULL;
11221 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11222 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11223 assert(_PyUnicode_CheckConsistency(result, 1));
11224 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011225}
11226
Walter Dörwald1ab83302007-05-18 17:15:44 +000011227void
Victor Stinner23e56682011-10-03 03:54:37 +020011228PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011229{
Victor Stinner23e56682011-10-03 03:54:37 +020011230 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011231 Py_UCS4 maxchar, maxchar2;
11232 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011233
11234 if (p_left == NULL) {
11235 if (!PyErr_Occurred())
11236 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011237 return;
11238 }
Victor Stinner23e56682011-10-03 03:54:37 +020011239 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011240 if (right == NULL || left == NULL
11241 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011242 if (!PyErr_Occurred())
11243 PyErr_BadInternalCall();
11244 goto error;
11245 }
11246
Benjamin Petersonbac79492012-01-14 13:34:47 -050011247 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011248 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011249 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011250 goto error;
11251
Victor Stinner488fa492011-12-12 00:01:39 +010011252 /* Shortcuts */
11253 if (left == unicode_empty) {
11254 Py_DECREF(left);
11255 Py_INCREF(right);
11256 *p_left = right;
11257 return;
11258 }
11259 if (right == unicode_empty)
11260 return;
11261
11262 left_len = PyUnicode_GET_LENGTH(left);
11263 right_len = PyUnicode_GET_LENGTH(right);
11264 if (left_len > PY_SSIZE_T_MAX - right_len) {
11265 PyErr_SetString(PyExc_OverflowError,
11266 "strings are too large to concat");
11267 goto error;
11268 }
11269 new_len = left_len + right_len;
11270
11271 if (unicode_modifiable(left)
11272 && PyUnicode_CheckExact(right)
11273 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011274 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11275 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011276 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011277 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011278 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11279 {
11280 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011281 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011282 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011283
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011284 /* copy 'right' into the newly allocated area of 'left' */
11285 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011286 }
Victor Stinner488fa492011-12-12 00:01:39 +010011287 else {
11288 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11289 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011290 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011291
Victor Stinner488fa492011-12-12 00:01:39 +010011292 /* Concat the two Unicode strings */
11293 res = PyUnicode_New(new_len, maxchar);
11294 if (res == NULL)
11295 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011296 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11297 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011298 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011299 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011300 }
11301 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011302 return;
11303
11304error:
Victor Stinner488fa492011-12-12 00:01:39 +010011305 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011306}
11307
11308void
11309PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11310{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011311 PyUnicode_Append(pleft, right);
11312 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011313}
11314
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011315/*
11316Wraps stringlib_parse_args_finds() and additionally ensures that the
11317first argument is a unicode object.
11318*/
11319
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070011320static inline int
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011321parse_args_finds_unicode(const char * function_name, PyObject *args,
11322 PyObject **substring,
11323 Py_ssize_t *start, Py_ssize_t *end)
11324{
11325 if(stringlib_parse_args_finds(function_name, args, substring,
11326 start, end)) {
11327 if (ensure_unicode(*substring) < 0)
11328 return 0;
11329 return 1;
11330 }
11331 return 0;
11332}
11333
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011334PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011335 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011336\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011337Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011338string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011339interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011340
11341static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011342unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011343{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011344 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011345 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011346 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011347 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011348 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011349 void *buf1, *buf2;
11350 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011351
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011352 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011353 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011354
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011355 kind1 = PyUnicode_KIND(self);
11356 kind2 = PyUnicode_KIND(substring);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011357 if (kind1 < kind2)
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011358 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011359
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011360 len1 = PyUnicode_GET_LENGTH(self);
11361 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011362 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011363 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011364 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011365
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011366 buf1 = PyUnicode_DATA(self);
11367 buf2 = PyUnicode_DATA(substring);
11368 if (kind2 != kind1) {
11369 buf2 = _PyUnicode_AsKind(substring, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011370 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011371 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011372 }
11373 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011374 case PyUnicode_1BYTE_KIND:
11375 iresult = ucs1lib_count(
11376 ((Py_UCS1*)buf1) + start, end - start,
11377 buf2, len2, PY_SSIZE_T_MAX
11378 );
11379 break;
11380 case PyUnicode_2BYTE_KIND:
11381 iresult = ucs2lib_count(
11382 ((Py_UCS2*)buf1) + start, end - start,
11383 buf2, len2, PY_SSIZE_T_MAX
11384 );
11385 break;
11386 case PyUnicode_4BYTE_KIND:
11387 iresult = ucs4lib_count(
11388 ((Py_UCS4*)buf1) + start, end - start,
11389 buf2, len2, PY_SSIZE_T_MAX
11390 );
11391 break;
11392 default:
11393 assert(0); iresult = 0;
11394 }
11395
11396 result = PyLong_FromSsize_t(iresult);
11397
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011398 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011399 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011400
Guido van Rossumd57fd912000-03-10 22:53:23 +000011401 return result;
11402}
11403
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011404PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000011405 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011406\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000011407Encode S using the codec registered for encoding. Default encoding\n\
11408is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000011409handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000011410a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
11411'xmlcharrefreplace' as well as any other name registered with\n\
11412codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011413
11414static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011415unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011416{
Benjamin Peterson308d6372009-09-18 21:42:35 +000011417 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011418 char *encoding = NULL;
11419 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000011420
Benjamin Peterson308d6372009-09-18 21:42:35 +000011421 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
11422 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011423 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011424 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011425}
11426
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011427PyDoc_STRVAR(expandtabs__doc__,
Ezio Melotti745d54d2013-11-16 19:10:57 +020011428 "S.expandtabs(tabsize=8) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011429\n\
11430Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011431If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011432
11433static PyObject*
Ezio Melotti745d54d2013-11-16 19:10:57 +020011434unicode_expandtabs(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011435{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011436 Py_ssize_t i, j, line_pos, src_len, incr;
11437 Py_UCS4 ch;
11438 PyObject *u;
11439 void *src_data, *dest_data;
Ezio Melotti745d54d2013-11-16 19:10:57 +020011440 static char *kwlist[] = {"tabsize", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011441 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011442 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011443 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011444
Ezio Melotti745d54d2013-11-16 19:10:57 +020011445 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:expandtabs",
11446 kwlist, &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000011447 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011448
Antoine Pitrou22425222011-10-04 19:10:51 +020011449 if (PyUnicode_READY(self) == -1)
11450 return NULL;
11451
Thomas Wouters7e474022000-07-16 12:04:32 +000011452 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011453 src_len = PyUnicode_GET_LENGTH(self);
11454 i = j = line_pos = 0;
11455 kind = PyUnicode_KIND(self);
11456 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011457 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011458 for (; i < src_len; i++) {
11459 ch = PyUnicode_READ(kind, src_data, i);
11460 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011461 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011462 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011463 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011464 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011465 goto overflow;
11466 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011467 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011468 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011469 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011470 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011471 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011472 goto overflow;
11473 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011474 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011475 if (ch == '\n' || ch == '\r')
11476 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011477 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011478 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011479 if (!found)
11480 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011481
Guido van Rossumd57fd912000-03-10 22:53:23 +000011482 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011483 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011484 if (!u)
11485 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011486 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011487
Antoine Pitroue71d5742011-10-04 15:55:09 +020011488 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011489
Antoine Pitroue71d5742011-10-04 15:55:09 +020011490 for (; i < src_len; i++) {
11491 ch = PyUnicode_READ(kind, src_data, i);
11492 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011493 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011494 incr = tabsize - (line_pos % tabsize);
11495 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010011496 FILL(kind, dest_data, ' ', j, incr);
11497 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011498 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011499 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011500 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011501 line_pos++;
11502 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011503 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011504 if (ch == '\n' || ch == '\r')
11505 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011506 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011507 }
11508 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011509 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011510
Antoine Pitroue71d5742011-10-04 15:55:09 +020011511 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011512 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11513 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011514}
11515
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011516PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011517 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011518\n\
11519Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011520such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011521arguments start and end are interpreted as in slice notation.\n\
11522\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011523Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011524
11525static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011526unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011527{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011528 /* initialize variables to prevent gcc warning */
11529 PyObject *substring = NULL;
11530 Py_ssize_t start = 0;
11531 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011532 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011533
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011534 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011535 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011536
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011537 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011538 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011539
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011540 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011541
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011542 if (result == -2)
11543 return NULL;
11544
Christian Heimes217cfd12007-12-02 14:31:20 +000011545 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011546}
11547
11548static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011549unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011550{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011551 void *data;
11552 enum PyUnicode_Kind kind;
11553 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011554
11555 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
11556 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011557 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011558 }
11559 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11560 PyErr_SetString(PyExc_IndexError, "string index out of range");
11561 return NULL;
11562 }
11563 kind = PyUnicode_KIND(self);
11564 data = PyUnicode_DATA(self);
11565 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011566 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011567}
11568
Guido van Rossumc2504932007-09-18 19:42:40 +000011569/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011570 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011571static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011572unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011573{
Guido van Rossumc2504932007-09-18 19:42:40 +000011574 Py_ssize_t len;
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011575 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011576
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011577#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011578 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011579#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011580 if (_PyUnicode_HASH(self) != -1)
11581 return _PyUnicode_HASH(self);
11582 if (PyUnicode_READY(self) == -1)
11583 return -1;
11584 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011585 /*
11586 We make the hash of the empty string be 0, rather than using
11587 (prefix ^ suffix), since this slightly obfuscates the hash secret
11588 */
11589 if (len == 0) {
11590 _PyUnicode_HASH(self) = 0;
11591 return 0;
11592 }
Christian Heimes985ecdc2013-11-20 11:46:18 +010011593 x = _Py_HashBytes(PyUnicode_DATA(self),
11594 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011595 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011596 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011597}
11598
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011599PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011600 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011601\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011602Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011603
11604static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011605unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011606{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011607 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011608 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011609 PyObject *substring = NULL;
11610 Py_ssize_t start = 0;
11611 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011612
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011613 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011614 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011615
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011616 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011617 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011618
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011619 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011620
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011621 if (result == -2)
11622 return NULL;
11623
Guido van Rossumd57fd912000-03-10 22:53:23 +000011624 if (result < 0) {
11625 PyErr_SetString(PyExc_ValueError, "substring not found");
11626 return NULL;
11627 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011628
Christian Heimes217cfd12007-12-02 14:31:20 +000011629 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011630}
11631
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011632PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011633 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011634\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011635Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011636at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011637
11638static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011639unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011640{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011641 Py_ssize_t i, length;
11642 int kind;
11643 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011644 int cased;
11645
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011646 if (PyUnicode_READY(self) == -1)
11647 return NULL;
11648 length = PyUnicode_GET_LENGTH(self);
11649 kind = PyUnicode_KIND(self);
11650 data = PyUnicode_DATA(self);
11651
Guido van Rossumd57fd912000-03-10 22:53:23 +000011652 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011653 if (length == 1)
11654 return PyBool_FromLong(
11655 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011656
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011657 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011658 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011659 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011660
Guido van Rossumd57fd912000-03-10 22:53:23 +000011661 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011662 for (i = 0; i < length; i++) {
11663 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011664
Benjamin Peterson29060642009-01-31 22:14:21 +000011665 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11666 return PyBool_FromLong(0);
11667 else if (!cased && Py_UNICODE_ISLOWER(ch))
11668 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011669 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011670 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011671}
11672
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011673PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011674 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011675\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011676Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011677at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011678
11679static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011680unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011681{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011682 Py_ssize_t i, length;
11683 int kind;
11684 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011685 int cased;
11686
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011687 if (PyUnicode_READY(self) == -1)
11688 return NULL;
11689 length = PyUnicode_GET_LENGTH(self);
11690 kind = PyUnicode_KIND(self);
11691 data = PyUnicode_DATA(self);
11692
Guido van Rossumd57fd912000-03-10 22:53:23 +000011693 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011694 if (length == 1)
11695 return PyBool_FromLong(
11696 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011697
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011698 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011699 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011700 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011701
Guido van Rossumd57fd912000-03-10 22:53:23 +000011702 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011703 for (i = 0; i < length; i++) {
11704 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011705
Benjamin Peterson29060642009-01-31 22:14:21 +000011706 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11707 return PyBool_FromLong(0);
11708 else if (!cased && Py_UNICODE_ISUPPER(ch))
11709 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011710 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011711 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011712}
11713
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011714PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011715 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011716\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011717Return True if S is a titlecased string and there is at least one\n\
11718character in S, i.e. upper- and titlecase characters may only\n\
11719follow uncased characters and lowercase characters only cased ones.\n\
11720Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011721
11722static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011723unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011724{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011725 Py_ssize_t i, length;
11726 int kind;
11727 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011728 int cased, previous_is_cased;
11729
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011730 if (PyUnicode_READY(self) == -1)
11731 return NULL;
11732 length = PyUnicode_GET_LENGTH(self);
11733 kind = PyUnicode_KIND(self);
11734 data = PyUnicode_DATA(self);
11735
Guido van Rossumd57fd912000-03-10 22:53:23 +000011736 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011737 if (length == 1) {
11738 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11739 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11740 (Py_UNICODE_ISUPPER(ch) != 0));
11741 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011742
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011743 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011744 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011745 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011746
Guido van Rossumd57fd912000-03-10 22:53:23 +000011747 cased = 0;
11748 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011749 for (i = 0; i < length; i++) {
11750 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011751
Benjamin Peterson29060642009-01-31 22:14:21 +000011752 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11753 if (previous_is_cased)
11754 return PyBool_FromLong(0);
11755 previous_is_cased = 1;
11756 cased = 1;
11757 }
11758 else if (Py_UNICODE_ISLOWER(ch)) {
11759 if (!previous_is_cased)
11760 return PyBool_FromLong(0);
11761 previous_is_cased = 1;
11762 cased = 1;
11763 }
11764 else
11765 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011766 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011767 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011768}
11769
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011770PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011771 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011772\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011773Return True if all characters in S are whitespace\n\
11774and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011775
11776static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011777unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011778{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011779 Py_ssize_t i, length;
11780 int kind;
11781 void *data;
11782
11783 if (PyUnicode_READY(self) == -1)
11784 return NULL;
11785 length = PyUnicode_GET_LENGTH(self);
11786 kind = PyUnicode_KIND(self);
11787 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011788
Guido van Rossumd57fd912000-03-10 22:53:23 +000011789 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011790 if (length == 1)
11791 return PyBool_FromLong(
11792 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011793
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011794 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011795 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011796 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011797
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011798 for (i = 0; i < length; i++) {
11799 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011800 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011801 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011802 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011803 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011804}
11805
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011806PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011807 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011808\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011809Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011810and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011811
11812static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011813unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011814{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011815 Py_ssize_t i, length;
11816 int kind;
11817 void *data;
11818
11819 if (PyUnicode_READY(self) == -1)
11820 return NULL;
11821 length = PyUnicode_GET_LENGTH(self);
11822 kind = PyUnicode_KIND(self);
11823 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011824
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011825 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011826 if (length == 1)
11827 return PyBool_FromLong(
11828 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011829
11830 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011831 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011832 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011833
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011834 for (i = 0; i < length; i++) {
11835 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011836 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011837 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011838 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011839}
11840
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011841PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011842 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011843\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011844Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011845and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011846
11847static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011848unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011849{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011850 int kind;
11851 void *data;
11852 Py_ssize_t len, i;
11853
11854 if (PyUnicode_READY(self) == -1)
11855 return NULL;
11856
11857 kind = PyUnicode_KIND(self);
11858 data = PyUnicode_DATA(self);
11859 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011860
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011861 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011862 if (len == 1) {
11863 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11864 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11865 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011866
11867 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011868 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011869 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011870
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011871 for (i = 0; i < len; i++) {
11872 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011873 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011874 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011875 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011876 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011877}
11878
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011879PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011880 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011881\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011882Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011883False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011884
11885static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011886unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011887{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011888 Py_ssize_t i, length;
11889 int kind;
11890 void *data;
11891
11892 if (PyUnicode_READY(self) == -1)
11893 return NULL;
11894 length = PyUnicode_GET_LENGTH(self);
11895 kind = PyUnicode_KIND(self);
11896 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011897
Guido van Rossumd57fd912000-03-10 22:53:23 +000011898 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011899 if (length == 1)
11900 return PyBool_FromLong(
11901 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011902
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011903 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011904 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011905 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011906
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011907 for (i = 0; i < length; i++) {
11908 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011909 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011910 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011911 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011912}
11913
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011914PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011915 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011916\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011917Return True if all characters in S are digits\n\
11918and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011919
11920static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011921unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011922{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011923 Py_ssize_t i, length;
11924 int kind;
11925 void *data;
11926
11927 if (PyUnicode_READY(self) == -1)
11928 return NULL;
11929 length = PyUnicode_GET_LENGTH(self);
11930 kind = PyUnicode_KIND(self);
11931 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011932
Guido van Rossumd57fd912000-03-10 22:53:23 +000011933 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011934 if (length == 1) {
11935 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11936 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11937 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011938
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011939 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011940 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011941 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011942
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011943 for (i = 0; i < length; i++) {
11944 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011945 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011946 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011947 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011948}
11949
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011950PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011951 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011952\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011953Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011954False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011955
11956static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011957unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011958{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011959 Py_ssize_t i, length;
11960 int kind;
11961 void *data;
11962
11963 if (PyUnicode_READY(self) == -1)
11964 return NULL;
11965 length = PyUnicode_GET_LENGTH(self);
11966 kind = PyUnicode_KIND(self);
11967 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011968
Guido van Rossumd57fd912000-03-10 22:53:23 +000011969 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011970 if (length == 1)
11971 return PyBool_FromLong(
11972 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011973
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011974 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011975 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011976 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011977
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011978 for (i = 0; i < length; i++) {
11979 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011980 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011981 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011982 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011983}
11984
Martin v. Löwis47383402007-08-15 07:32:56 +000011985int
11986PyUnicode_IsIdentifier(PyObject *self)
11987{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011988 int kind;
11989 void *data;
11990 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011991 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011992
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011993 if (PyUnicode_READY(self) == -1) {
11994 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011995 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011996 }
11997
11998 /* Special case for empty strings */
11999 if (PyUnicode_GET_LENGTH(self) == 0)
12000 return 0;
12001 kind = PyUnicode_KIND(self);
12002 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000012003
12004 /* PEP 3131 says that the first character must be in
12005 XID_Start and subsequent characters in XID_Continue,
12006 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000012007 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000012008 letters, digits, underscore). However, given the current
12009 definition of XID_Start and XID_Continue, it is sufficient
12010 to check just for these, except that _ must be allowed
12011 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012012 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050012013 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000012014 return 0;
12015
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040012016 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012017 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000012018 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000012019 return 1;
12020}
12021
12022PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012023 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000012024\n\
12025Return True if S is a valid identifier according\n\
Raymond Hettinger378170d2013-03-23 08:21:12 -070012026to the language definition.\n\
12027\n\
12028Use keyword.iskeyword() to test for reserved identifiers\n\
12029such as \"def\" and \"class\".\n");
Martin v. Löwis47383402007-08-15 07:32:56 +000012030
12031static PyObject*
12032unicode_isidentifier(PyObject *self)
12033{
12034 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12035}
12036
Georg Brandl559e5d72008-06-11 18:37:52 +000012037PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012038 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000012039\n\
12040Return True if all characters in S are considered\n\
12041printable in repr() or S is empty, False otherwise.");
12042
12043static PyObject*
12044unicode_isprintable(PyObject *self)
12045{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012046 Py_ssize_t i, length;
12047 int kind;
12048 void *data;
12049
12050 if (PyUnicode_READY(self) == -1)
12051 return NULL;
12052 length = PyUnicode_GET_LENGTH(self);
12053 kind = PyUnicode_KIND(self);
12054 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000012055
12056 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012057 if (length == 1)
12058 return PyBool_FromLong(
12059 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000012060
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012061 for (i = 0; i < length; i++) {
12062 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012063 Py_RETURN_FALSE;
12064 }
12065 }
12066 Py_RETURN_TRUE;
12067}
12068
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012069PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000012070 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012071\n\
12072Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000012073iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012074
12075static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012076unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012077{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012078 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012079}
12080
Martin v. Löwis18e16552006-02-15 17:27:45 +000012081static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012082unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012083{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012084 if (PyUnicode_READY(self) == -1)
12085 return -1;
12086 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012087}
12088
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012089PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012090 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012091\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012092Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012093done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012094
12095static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012096unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012097{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012098 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012099 Py_UCS4 fillchar = ' ';
12100
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012101 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012102 return NULL;
12103
Benjamin Petersonbac79492012-01-14 13:34:47 -050012104 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012105 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012106
Victor Stinnerc4b49542011-12-11 22:44:26 +010012107 if (PyUnicode_GET_LENGTH(self) >= width)
12108 return unicode_result_unchanged(self);
12109
12110 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012111}
12112
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012113PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012114 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012115\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012116Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012117
12118static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012119unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012120{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012121 if (PyUnicode_READY(self) == -1)
12122 return NULL;
12123 if (PyUnicode_IS_ASCII(self))
12124 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012125 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012126}
12127
/* Which end(s) of the string a strip operation works on.  The values are
   also used as indices into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple() format strings, indexed by the strip type above. */
static const char * const stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for strip type i: the format string with its 3-character
   "|O:" prefix skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
12136
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012137/* externally visible for str.strip(unicode) */
12138PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012139_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012140{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012141 void *data;
12142 int kind;
12143 Py_ssize_t i, j, len;
12144 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012145 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012146
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012147 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12148 return NULL;
12149
12150 kind = PyUnicode_KIND(self);
12151 data = PyUnicode_DATA(self);
12152 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020012153 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012154 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12155 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020012156 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012157
Benjamin Peterson14339b62009-01-31 16:36:08 +000012158 i = 0;
12159 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012160 while (i < len) {
12161 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12162 if (!BLOOM(sepmask, ch))
12163 break;
12164 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12165 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012166 i++;
12167 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012168 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012169
Benjamin Peterson14339b62009-01-31 16:36:08 +000012170 j = len;
12171 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012172 j--;
12173 while (j >= i) {
12174 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12175 if (!BLOOM(sepmask, ch))
12176 break;
12177 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12178 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012179 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012180 }
12181
Benjamin Peterson29060642009-01-31 22:14:21 +000012182 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012183 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012184
Victor Stinner7931d9a2011-11-04 00:22:48 +010012185 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012186}
12187
12188PyObject*
12189PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12190{
12191 unsigned char *data;
12192 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012193 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012194
Victor Stinnerde636f32011-10-01 03:55:54 +020012195 if (PyUnicode_READY(self) == -1)
12196 return NULL;
12197
Victor Stinner684d5fd2012-05-03 02:32:34 +020012198 length = PyUnicode_GET_LENGTH(self);
12199 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012200
Victor Stinner684d5fd2012-05-03 02:32:34 +020012201 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012202 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012203
Victor Stinnerde636f32011-10-01 03:55:54 +020012204 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012205 PyErr_SetString(PyExc_IndexError, "string index out of range");
12206 return NULL;
12207 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012208 if (start >= length || end < start)
12209 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012210
Victor Stinner684d5fd2012-05-03 02:32:34 +020012211 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012212 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012213 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012214 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012215 }
12216 else {
12217 kind = PyUnicode_KIND(self);
12218 data = PyUnicode_1BYTE_DATA(self);
12219 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012220 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012221 length);
12222 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012223}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012224
12225static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012226do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012227{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012228 Py_ssize_t len, i, j;
12229
12230 if (PyUnicode_READY(self) == -1)
12231 return NULL;
12232
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012233 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012234
Victor Stinnercc7af722013-04-09 22:39:24 +020012235 if (PyUnicode_IS_ASCII(self)) {
12236 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12237
12238 i = 0;
12239 if (striptype != RIGHTSTRIP) {
12240 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012241 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012242 if (!_Py_ascii_whitespace[ch])
12243 break;
12244 i++;
12245 }
12246 }
12247
12248 j = len;
12249 if (striptype != LEFTSTRIP) {
12250 j--;
12251 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012252 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012253 if (!_Py_ascii_whitespace[ch])
12254 break;
12255 j--;
12256 }
12257 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012258 }
12259 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012260 else {
12261 int kind = PyUnicode_KIND(self);
12262 void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012263
Victor Stinnercc7af722013-04-09 22:39:24 +020012264 i = 0;
12265 if (striptype != RIGHTSTRIP) {
12266 while (i < len) {
12267 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12268 if (!Py_UNICODE_ISSPACE(ch))
12269 break;
12270 i++;
12271 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012272 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012273
12274 j = len;
12275 if (striptype != LEFTSTRIP) {
12276 j--;
12277 while (j >= i) {
12278 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12279 if (!Py_UNICODE_ISSPACE(ch))
12280 break;
12281 j--;
12282 }
12283 j++;
12284 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012285 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012286
Victor Stinner7931d9a2011-11-04 00:22:48 +010012287 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012288}
12289
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012290
12291static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012292do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012293{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012294 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012295
Serhiy Storchakac6792272013-10-19 21:03:34 +030012296 if (!PyArg_ParseTuple(args, stripformat[striptype], &sep))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012297 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012298
Benjamin Peterson14339b62009-01-31 16:36:08 +000012299 if (sep != NULL && sep != Py_None) {
12300 if (PyUnicode_Check(sep))
12301 return _PyUnicode_XStrip(self, striptype, sep);
12302 else {
12303 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012304 "%s arg must be None or str",
12305 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012306 return NULL;
12307 }
12308 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012309
Benjamin Peterson14339b62009-01-31 16:36:08 +000012310 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012311}
12312
12313
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012314PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012315 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012316\n\
12317Return a copy of the string S with leading and trailing\n\
12318whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012319If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012320
12321static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012322unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012323{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012324 if (PyTuple_GET_SIZE(args) == 0)
12325 return do_strip(self, BOTHSTRIP); /* Common case */
12326 else
12327 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012328}
12329
12330
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012331PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012332 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012333\n\
12334Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012335If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012336
12337static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012338unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012339{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012340 if (PyTuple_GET_SIZE(args) == 0)
12341 return do_strip(self, LEFTSTRIP); /* Common case */
12342 else
12343 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012344}
12345
12346
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012347PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012348 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012349\n\
12350Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012351If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012352
12353static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012354unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012355{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012356 if (PyTuple_GET_SIZE(args) == 0)
12357 return do_strip(self, RIGHTSTRIP); /* Common case */
12358 else
12359 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012360}
12361
12362
Guido van Rossumd57fd912000-03-10 22:53:23 +000012363static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012364unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012365{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012366 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012367 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012368
Serhiy Storchaka05997252013-01-26 12:14:02 +020012369 if (len < 1)
12370 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012371
Victor Stinnerc4b49542011-12-11 22:44:26 +010012372 /* no repeat, return original string */
12373 if (len == 1)
12374 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012375
Benjamin Petersonbac79492012-01-14 13:34:47 -050012376 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012377 return NULL;
12378
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012379 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012380 PyErr_SetString(PyExc_OverflowError,
12381 "repeated string is too long");
12382 return NULL;
12383 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012384 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012385
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012386 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012387 if (!u)
12388 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012389 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012390
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012391 if (PyUnicode_GET_LENGTH(str) == 1) {
12392 const int kind = PyUnicode_KIND(str);
12393 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012394 if (kind == PyUnicode_1BYTE_KIND) {
12395 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012396 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012397 }
12398 else if (kind == PyUnicode_2BYTE_KIND) {
12399 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012400 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012401 ucs2[n] = fill_char;
12402 } else {
12403 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12404 assert(kind == PyUnicode_4BYTE_KIND);
12405 for (n = 0; n < len; ++n)
12406 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012407 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012408 }
12409 else {
12410 /* number of characters copied this far */
12411 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012412 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012413 char *to = (char *) PyUnicode_DATA(u);
Christian Heimesf051e432016-09-13 20:22:02 +020012414 memcpy(to, PyUnicode_DATA(str),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012415 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012416 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012417 n = (done <= nchars-done) ? done : nchars-done;
Christian Heimesf051e432016-09-13 20:22:02 +020012418 memcpy(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012419 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012420 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012421 }
12422
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012423 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012424 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012425}
12426
Alexander Belopolsky40018472011-02-26 01:02:56 +000012427PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012428PyUnicode_Replace(PyObject *str,
12429 PyObject *substr,
12430 PyObject *replstr,
Alexander Belopolsky40018472011-02-26 01:02:56 +000012431 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012432{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012433 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12434 ensure_unicode(replstr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012435 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012436 return replace(str, substr, replstr, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012437}
12438
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012439PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000012440 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012441\n\
12442Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000012443old replaced by new. If the optional argument count is\n\
12444given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012445
12446static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012447unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012448{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012449 PyObject *str1;
12450 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012451 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012452
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012453 if (!PyArg_ParseTuple(args, "UU|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012454 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060012455 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012456 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012457 return replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012458}
12459
Alexander Belopolsky40018472011-02-26 01:02:56 +000012460static PyObject *
12461unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012462{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012463 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012464 Py_ssize_t isize;
12465 Py_ssize_t osize, squote, dquote, i, o;
12466 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012467 int ikind, okind, unchanged;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012468 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012469
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012470 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012471 return NULL;
12472
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012473 isize = PyUnicode_GET_LENGTH(unicode);
12474 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012475
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012476 /* Compute length of output, quote characters, and
12477 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012478 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012479 max = 127;
12480 squote = dquote = 0;
12481 ikind = PyUnicode_KIND(unicode);
12482 for (i = 0; i < isize; i++) {
12483 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040012484 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012485 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040012486 case '\'': squote++; break;
12487 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012488 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040012489 incr = 2;
12490 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012491 default:
12492 /* Fast-path ASCII */
12493 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012494 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012495 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012496 ;
12497 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012498 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012499 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012500 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012501 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012502 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012503 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040012504 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012505 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040012506 if (osize > PY_SSIZE_T_MAX - incr) {
12507 PyErr_SetString(PyExc_OverflowError,
12508 "string is too long to generate repr");
12509 return NULL;
12510 }
12511 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012512 }
12513
12514 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012515 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012516 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012517 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012518 if (dquote)
12519 /* Both squote and dquote present. Use squote,
12520 and escape them */
12521 osize += squote;
12522 else
12523 quote = '"';
12524 }
Victor Stinner55c08782013-04-14 18:45:39 +020012525 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012526
12527 repr = PyUnicode_New(osize, max);
12528 if (repr == NULL)
12529 return NULL;
12530 okind = PyUnicode_KIND(repr);
12531 odata = PyUnicode_DATA(repr);
12532
12533 PyUnicode_WRITE(okind, odata, 0, quote);
12534 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012535 if (unchanged) {
12536 _PyUnicode_FastCopyCharacters(repr, 1,
12537 unicode, 0,
12538 isize);
12539 }
12540 else {
12541 for (i = 0, o = 1; i < isize; i++) {
12542 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012543
Victor Stinner55c08782013-04-14 18:45:39 +020012544 /* Escape quotes and backslashes */
12545 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012546 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012547 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012548 continue;
12549 }
12550
12551 /* Map special whitespace to '\t', \n', '\r' */
12552 if (ch == '\t') {
12553 PyUnicode_WRITE(okind, odata, o++, '\\');
12554 PyUnicode_WRITE(okind, odata, o++, 't');
12555 }
12556 else if (ch == '\n') {
12557 PyUnicode_WRITE(okind, odata, o++, '\\');
12558 PyUnicode_WRITE(okind, odata, o++, 'n');
12559 }
12560 else if (ch == '\r') {
12561 PyUnicode_WRITE(okind, odata, o++, '\\');
12562 PyUnicode_WRITE(okind, odata, o++, 'r');
12563 }
12564
12565 /* Map non-printable US ASCII to '\xhh' */
12566 else if (ch < ' ' || ch == 0x7F) {
12567 PyUnicode_WRITE(okind, odata, o++, '\\');
12568 PyUnicode_WRITE(okind, odata, o++, 'x');
12569 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12570 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12571 }
12572
12573 /* Copy ASCII characters as-is */
12574 else if (ch < 0x7F) {
12575 PyUnicode_WRITE(okind, odata, o++, ch);
12576 }
12577
12578 /* Non-ASCII characters */
12579 else {
12580 /* Map Unicode whitespace and control characters
12581 (categories Z* and C* except ASCII space)
12582 */
12583 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12584 PyUnicode_WRITE(okind, odata, o++, '\\');
12585 /* Map 8-bit characters to '\xhh' */
12586 if (ch <= 0xff) {
12587 PyUnicode_WRITE(okind, odata, o++, 'x');
12588 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12589 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12590 }
12591 /* Map 16-bit characters to '\uxxxx' */
12592 else if (ch <= 0xffff) {
12593 PyUnicode_WRITE(okind, odata, o++, 'u');
12594 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12595 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12596 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12597 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12598 }
12599 /* Map 21-bit characters to '\U00xxxxxx' */
12600 else {
12601 PyUnicode_WRITE(okind, odata, o++, 'U');
12602 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12603 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12604 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12605 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12606 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12607 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12608 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12609 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12610 }
12611 }
12612 /* Copy characters as-is */
12613 else {
12614 PyUnicode_WRITE(okind, odata, o++, ch);
12615 }
Georg Brandl559e5d72008-06-11 18:37:52 +000012616 }
12617 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012618 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012619 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012620 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012621 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012622}
12623
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012624PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012625 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012626\n\
12627Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012628such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012629arguments start and end are interpreted as in slice notation.\n\
12630\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012631Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012632
12633static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012634unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012635{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012636 /* initialize variables to prevent gcc warning */
12637 PyObject *substring = NULL;
12638 Py_ssize_t start = 0;
12639 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012640 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012641
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012642 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012643 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012644
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012645 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012646 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012647
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012648 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012649
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012650 if (result == -2)
12651 return NULL;
12652
Christian Heimes217cfd12007-12-02 14:31:20 +000012653 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012654}
12655
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012656PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012657 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012658\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012659Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012660
12661static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012662unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012663{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012664 /* initialize variables to prevent gcc warning */
12665 PyObject *substring = NULL;
12666 Py_ssize_t start = 0;
12667 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012668 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012669
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012670 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012671 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012672
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012673 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012674 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012675
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012676 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012677
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012678 if (result == -2)
12679 return NULL;
12680
Guido van Rossumd57fd912000-03-10 22:53:23 +000012681 if (result < 0) {
12682 PyErr_SetString(PyExc_ValueError, "substring not found");
12683 return NULL;
12684 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012685
Christian Heimes217cfd12007-12-02 14:31:20 +000012686 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012687}
12688
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012689PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012690 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012691\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012692Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012693done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012694
12695static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012696unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012697{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012698 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012699 Py_UCS4 fillchar = ' ';
12700
Victor Stinnere9a29352011-10-01 02:14:59 +020012701 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012702 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012703
Benjamin Petersonbac79492012-01-14 13:34:47 -050012704 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012705 return NULL;
12706
Victor Stinnerc4b49542011-12-11 22:44:26 +010012707 if (PyUnicode_GET_LENGTH(self) >= width)
12708 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012709
Victor Stinnerc4b49542011-12-11 22:44:26 +010012710 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012711}
12712
Alexander Belopolsky40018472011-02-26 01:02:56 +000012713PyObject *
12714PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012715{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012716 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012717 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012718
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012719 return split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012720}
12721
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012722PyDoc_STRVAR(split__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012723 "S.split(sep=None, maxsplit=-1) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012724\n\
12725Return a list of the words in S, using sep as the\n\
12726delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012727splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012728whitespace string is a separator and empty strings are\n\
12729removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012730
12731static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012732unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012733{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012734 static char *kwlist[] = {"sep", "maxsplit", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000012735 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012736 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012737
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012738 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12739 kwlist, &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012740 return NULL;
12741
12742 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012743 return split(self, NULL, maxcount);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012744
12745 if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012746 return split(self, substring, maxcount);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012747
12748 PyErr_Format(PyExc_TypeError,
12749 "must be str or None, not %.100s",
12750 Py_TYPE(substring)->tp_name);
12751 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012752}
12753
Thomas Wouters477c8d52006-05-27 19:21:47 +000012754PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012755PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012756{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012757 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012758 int kind1, kind2;
12759 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012760 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012761
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012762 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012763 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012764
Victor Stinner14f8f022011-10-05 20:58:25 +020012765 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012766 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012767 len1 = PyUnicode_GET_LENGTH(str_obj);
12768 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012769 if (kind1 < kind2 || len1 < len2) {
12770 _Py_INCREF_UNICODE_EMPTY();
12771 if (!unicode_empty)
12772 out = NULL;
12773 else {
12774 out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
12775 Py_DECREF(unicode_empty);
12776 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012777 return out;
12778 }
12779 buf1 = PyUnicode_DATA(str_obj);
12780 buf2 = PyUnicode_DATA(sep_obj);
12781 if (kind2 != kind1) {
12782 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12783 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012784 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012785 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012786
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012787 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012788 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012789 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12790 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12791 else
12792 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012793 break;
12794 case PyUnicode_2BYTE_KIND:
12795 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12796 break;
12797 case PyUnicode_4BYTE_KIND:
12798 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12799 break;
12800 default:
12801 assert(0);
12802 out = 0;
12803 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012804
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012805 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012806 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012807
12808 return out;
12809}
12810
12811
12812PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012813PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012814{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012815 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012816 int kind1, kind2;
12817 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012818 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012819
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012820 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012821 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012822
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012823 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012824 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012825 len1 = PyUnicode_GET_LENGTH(str_obj);
12826 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012827 if (kind1 < kind2 || len1 < len2) {
12828 _Py_INCREF_UNICODE_EMPTY();
12829 if (!unicode_empty)
12830 out = NULL;
12831 else {
12832 out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
12833 Py_DECREF(unicode_empty);
12834 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012835 return out;
12836 }
12837 buf1 = PyUnicode_DATA(str_obj);
12838 buf2 = PyUnicode_DATA(sep_obj);
12839 if (kind2 != kind1) {
12840 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12841 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012842 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012843 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012844
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012845 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012846 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012847 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12848 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12849 else
12850 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012851 break;
12852 case PyUnicode_2BYTE_KIND:
12853 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12854 break;
12855 case PyUnicode_4BYTE_KIND:
12856 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12857 break;
12858 default:
12859 assert(0);
12860 out = 0;
12861 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012862
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012863 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012864 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012865
12866 return out;
12867}
12868
12869PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012870 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012871\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012872Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012873the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012874found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012875
12876static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012877unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012878{
Victor Stinner9310abb2011-10-05 00:59:23 +020012879 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012880}
12881
12882PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012883 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012884\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012885Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012886the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012887separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012888
12889static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012890unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012891{
Victor Stinner9310abb2011-10-05 00:59:23 +020012892 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012893}
12894
Alexander Belopolsky40018472011-02-26 01:02:56 +000012895PyObject *
12896PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012897{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012898 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012899 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012900
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012901 return rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012902}
12903
12904PyDoc_STRVAR(rsplit__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012905 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012906\n\
12907Return a list of the words in S, using sep as the\n\
12908delimiter string, starting at the end of the string and\n\
12909working to the front. If maxsplit is given, at most maxsplit\n\
12910splits are done. If sep is not specified, any whitespace string\n\
12911is a separator.");
12912
12913static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012914unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012915{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012916 static char *kwlist[] = {"sep", "maxsplit", 0};
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012917 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012918 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012919
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012920 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
12921 kwlist, &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012922 return NULL;
12923
12924 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012925 return rsplit(self, NULL, maxcount);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012926
12927 if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012928 return rsplit(self, substring, maxcount);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012929
12930 PyErr_Format(PyExc_TypeError,
12931 "must be str or None, not %.100s",
12932 Py_TYPE(substring)->tp_name);
12933 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012934}
12935
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012936PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012937 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012938\n\
12939Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012940Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012941is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012942
12943static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012944unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012945{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012946 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012947 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012948
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012949 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12950 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012951 return NULL;
12952
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012953 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012954}
12955
12956static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012957PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012958{
Victor Stinnerc4b49542011-12-11 22:44:26 +010012959 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012960}
12961
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012962PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012963 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012964\n\
12965Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012966and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012967
12968static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012969unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012970{
Benjamin Petersoneea48462012-01-16 14:28:50 -050012971 if (PyUnicode_READY(self) == -1)
12972 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012973 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012974}
12975
Larry Hastings61272b72014-01-07 12:41:53 -080012976/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000012977
Larry Hastings31826802013-10-19 00:09:25 -070012978@staticmethod
12979str.maketrans as unicode_maketrans
12980
12981 x: object
12982
12983 y: unicode=NULL
12984
12985 z: unicode=NULL
12986
12987 /
12988
12989Return a translation table usable for str.translate().
12990
12991If there is only one argument, it must be a dictionary mapping Unicode
12992ordinals (integers) or characters to Unicode ordinals, strings or None.
12993Character keys will be then converted to ordinals.
12994If there are two arguments, they must be strings of equal length, and
12995in the resulting dictionary, each character in x will be mapped to the
12996character at the same position in y. If there is a third argument, it
12997must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080012998[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070012999
Larry Hastings31826802013-10-19 00:09:25 -070013000static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080013001unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030013002/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070013003{
Georg Brandlceee0772007-11-27 23:48:05 +000013004 PyObject *new = NULL, *key, *value;
13005 Py_ssize_t i = 0;
13006 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013007
Georg Brandlceee0772007-11-27 23:48:05 +000013008 new = PyDict_New();
13009 if (!new)
13010 return NULL;
13011 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013012 int x_kind, y_kind, z_kind;
13013 void *x_data, *y_data, *z_data;
13014
Georg Brandlceee0772007-11-27 23:48:05 +000013015 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000013016 if (!PyUnicode_Check(x)) {
13017 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13018 "be a string if there is a second argument");
13019 goto err;
13020 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013021 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013022 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13023 "arguments must have equal length");
13024 goto err;
13025 }
13026 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013027 x_kind = PyUnicode_KIND(x);
13028 y_kind = PyUnicode_KIND(y);
13029 x_data = PyUnicode_DATA(x);
13030 y_data = PyUnicode_DATA(y);
13031 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13032 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013033 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000013034 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060013035 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013036 if (!value) {
13037 Py_DECREF(key);
13038 goto err;
13039 }
Georg Brandlceee0772007-11-27 23:48:05 +000013040 res = PyDict_SetItem(new, key, value);
13041 Py_DECREF(key);
13042 Py_DECREF(value);
13043 if (res < 0)
13044 goto err;
13045 }
13046 /* create entries for deleting chars in z */
13047 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013048 z_kind = PyUnicode_KIND(z);
13049 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013050 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013051 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013052 if (!key)
13053 goto err;
13054 res = PyDict_SetItem(new, key, Py_None);
13055 Py_DECREF(key);
13056 if (res < 0)
13057 goto err;
13058 }
13059 }
13060 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013061 int kind;
13062 void *data;
13063
Georg Brandlceee0772007-11-27 23:48:05 +000013064 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013065 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013066 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13067 "to maketrans it must be a dict");
13068 goto err;
13069 }
13070 /* copy entries into the new dict, converting string keys to int keys */
13071 while (PyDict_Next(x, &i, &key, &value)) {
13072 if (PyUnicode_Check(key)) {
13073 /* convert string keys to integer keys */
13074 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013075 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013076 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13077 "table must be of length 1");
13078 goto err;
13079 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013080 kind = PyUnicode_KIND(key);
13081 data = PyUnicode_DATA(key);
13082 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013083 if (!newkey)
13084 goto err;
13085 res = PyDict_SetItem(new, newkey, value);
13086 Py_DECREF(newkey);
13087 if (res < 0)
13088 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013089 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013090 /* just keep integer keys */
13091 if (PyDict_SetItem(new, key, value) < 0)
13092 goto err;
13093 } else {
13094 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13095 "be strings or integers");
13096 goto err;
13097 }
13098 }
13099 }
13100 return new;
13101 err:
13102 Py_DECREF(new);
13103 return NULL;
13104}
13105
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013106PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013107 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013108\n\
Zachary Ware79b98df2015-08-05 23:54:15 -050013109Return a copy of the string S in which each character has been mapped\n\
13110through the given translation table. The table must implement\n\
13111lookup/indexing via __getitem__, for instance a dictionary or list,\n\
13112mapping Unicode ordinals to Unicode ordinals, strings, or None. If\n\
13113this operation raises LookupError, the character is left untouched.\n\
13114Characters mapped to None are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013115
13116static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013117unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013118{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013119 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013120}
13121
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013122PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013123 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013124\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013125Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013126
13127static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020013128unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013129{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013130 if (PyUnicode_READY(self) == -1)
13131 return NULL;
13132 if (PyUnicode_IS_ASCII(self))
13133 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013134 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013135}
13136
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013137PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013138 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013139\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000013140Pad a numeric string S with zeros on the left, to fill a field\n\
13141of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013142
13143static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020013144unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013145{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013146 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013147 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013148 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013149 int kind;
13150 void *data;
13151 Py_UCS4 chr;
13152
Martin v. Löwis18e16552006-02-15 17:27:45 +000013153 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000013154 return NULL;
13155
Benjamin Petersonbac79492012-01-14 13:34:47 -050013156 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013157 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013158
Victor Stinnerc4b49542011-12-11 22:44:26 +010013159 if (PyUnicode_GET_LENGTH(self) >= width)
13160 return unicode_result_unchanged(self);
13161
13162 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013163
13164 u = pad(self, fill, 0, '0');
13165
Walter Dörwald068325e2002-04-15 13:36:47 +000013166 if (u == NULL)
13167 return NULL;
13168
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013169 kind = PyUnicode_KIND(u);
13170 data = PyUnicode_DATA(u);
13171 chr = PyUnicode_READ(kind, data, fill);
13172
13173 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013174 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013175 PyUnicode_WRITE(kind, data, 0, chr);
13176 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013177 }
13178
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013179 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013180 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013181}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013182
#if 0
/* Compiled out: forwards to PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
13190
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013191PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013192 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013193\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013194Return True if S starts with the specified prefix, False otherwise.\n\
13195With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013196With optional end, stop comparing S at that position.\n\
13197prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013198
13199static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013200unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013201 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013202{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013203 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013204 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013205 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013206 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013207 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013208
Jesus Ceaac451502011-04-20 17:09:23 +020013209 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013210 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013211 if (PyTuple_Check(subobj)) {
13212 Py_ssize_t i;
13213 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013214 substring = PyTuple_GET_ITEM(subobj, i);
13215 if (!PyUnicode_Check(substring)) {
13216 PyErr_Format(PyExc_TypeError,
13217 "tuple for startswith must only contain str, "
13218 "not %.100s",
13219 Py_TYPE(substring)->tp_name);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013220 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013221 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013222 result = tailmatch(self, substring, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013223 if (result == -1)
13224 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013225 if (result) {
13226 Py_RETURN_TRUE;
13227 }
13228 }
13229 /* nothing matched */
13230 Py_RETURN_FALSE;
13231 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013232 if (!PyUnicode_Check(subobj)) {
13233 PyErr_Format(PyExc_TypeError,
13234 "startswith first arg must be str or "
13235 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013236 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013237 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013238 result = tailmatch(self, subobj, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013239 if (result == -1)
13240 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013241 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013242}
13243
13244
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013245PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013246 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013247\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013248Return True if S ends with the specified suffix, False otherwise.\n\
13249With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013250With optional end, stop comparing S at that position.\n\
13251suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013252
13253static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013254unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013255 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013256{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013257 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013258 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013259 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013260 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013261 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013262
Jesus Ceaac451502011-04-20 17:09:23 +020013263 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013264 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013265 if (PyTuple_Check(subobj)) {
13266 Py_ssize_t i;
13267 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013268 substring = PyTuple_GET_ITEM(subobj, i);
13269 if (!PyUnicode_Check(substring)) {
13270 PyErr_Format(PyExc_TypeError,
13271 "tuple for endswith must only contain str, "
13272 "not %.100s",
13273 Py_TYPE(substring)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013274 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013275 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013276 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013277 if (result == -1)
13278 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013279 if (result) {
13280 Py_RETURN_TRUE;
13281 }
13282 }
13283 Py_RETURN_FALSE;
13284 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013285 if (!PyUnicode_Check(subobj)) {
13286 PyErr_Format(PyExc_TypeError,
13287 "endswith first arg must be str or "
13288 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013289 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013290 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013291 result = tailmatch(self, subobj, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013292 if (result == -1)
13293 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013294 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013295}
13296
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013297static inline void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013298_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013299{
Victor Stinnereb36fda2015-10-03 01:55:51 +020013300 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13301 writer->data = PyUnicode_DATA(writer->buffer);
13302
13303 if (!writer->readonly) {
13304 writer->kind = PyUnicode_KIND(writer->buffer);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013305 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinnereb36fda2015-10-03 01:55:51 +020013306 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013307 else {
Victor Stinnereb36fda2015-10-03 01:55:51 +020013308 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13309 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13310 writer->kind = PyUnicode_WCHAR_KIND;
13311 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13312
Victor Stinner8f674cc2013-04-17 23:02:17 +020013313 /* Copy-on-write mode: set buffer size to 0 so
13314 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13315 * next write. */
13316 writer->size = 0;
13317 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013318}
13319
Victor Stinnerd3f08822012-05-29 12:57:52 +020013320void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013321_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013322{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013323 memset(writer, 0, sizeof(*writer));
Victor Stinnereb36fda2015-10-03 01:55:51 +020013324
13325 /* ASCII is the bare minimum */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013326 writer->min_char = 127;
Victor Stinnereb36fda2015-10-03 01:55:51 +020013327
13328 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13329 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13330 writer->kind = PyUnicode_WCHAR_KIND;
13331 assert(writer->kind <= PyUnicode_1BYTE_KIND);
Victor Stinner202fdca2012-05-07 12:47:02 +020013332}
13333
Victor Stinnerd3f08822012-05-29 12:57:52 +020013334int
13335_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13336 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013337{
13338 Py_ssize_t newlen;
13339 PyObject *newbuffer;
13340
Victor Stinner2740e462016-09-06 16:58:36 -070013341 assert(maxchar <= MAX_UNICODE);
13342
Victor Stinnerca9381e2015-09-22 00:58:32 +020013343 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
Victor Stinner61744742015-09-22 01:01:17 +020013344 assert((maxchar > writer->maxchar && length >= 0)
13345 || length > 0);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013346
Victor Stinner202fdca2012-05-07 12:47:02 +020013347 if (length > PY_SSIZE_T_MAX - writer->pos) {
13348 PyErr_NoMemory();
13349 return -1;
13350 }
13351 newlen = writer->pos + length;
13352
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013353 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013354
Victor Stinnerd3f08822012-05-29 12:57:52 +020013355 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013356 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013357 if (writer->overallocate
13358 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13359 /* overallocate to limit the number of realloc() */
13360 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013361 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013362 if (newlen < writer->min_length)
13363 newlen = writer->min_length;
13364
Victor Stinnerd3f08822012-05-29 12:57:52 +020013365 writer->buffer = PyUnicode_New(newlen, maxchar);
13366 if (writer->buffer == NULL)
13367 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013368 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013369 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013370 if (writer->overallocate
13371 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13372 /* overallocate to limit the number of realloc() */
13373 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013374 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013375 if (newlen < writer->min_length)
13376 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013377
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013378 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013379 /* resize + widen */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +030013380 maxchar = Py_MAX(maxchar, writer->maxchar);
Victor Stinner202fdca2012-05-07 12:47:02 +020013381 newbuffer = PyUnicode_New(newlen, maxchar);
13382 if (newbuffer == NULL)
13383 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013384 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13385 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013386 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013387 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013388 }
13389 else {
13390 newbuffer = resize_compact(writer->buffer, newlen);
13391 if (newbuffer == NULL)
13392 return -1;
13393 }
13394 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013395 }
13396 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013397 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013398 newbuffer = PyUnicode_New(writer->size, maxchar);
13399 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013400 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013401 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13402 writer->buffer, 0, writer->pos);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030013403 Py_SETREF(writer->buffer, newbuffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013404 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013405 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013406 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010013407
13408#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020013409}
13410
Victor Stinnerca9381e2015-09-22 00:58:32 +020013411int
13412_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13413 enum PyUnicode_Kind kind)
13414{
13415 Py_UCS4 maxchar;
13416
13417 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13418 assert(writer->kind < kind);
13419
13420 switch (kind)
13421 {
13422 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13423 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13424 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
13425 default:
13426 assert(0 && "invalid kind");
13427 return -1;
13428 }
13429
13430 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13431}
13432
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013433static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013434_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013435{
Victor Stinner2740e462016-09-06 16:58:36 -070013436 assert(ch <= MAX_UNICODE);
Victor Stinnera0dd0212013-04-11 22:09:04 +020013437 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13438 return -1;
13439 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13440 writer->pos++;
13441 return 0;
13442}
13443
13444int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013445_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13446{
13447 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13448}
13449
13450int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013451_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13452{
13453 Py_UCS4 maxchar;
13454 Py_ssize_t len;
13455
13456 if (PyUnicode_READY(str) == -1)
13457 return -1;
13458 len = PyUnicode_GET_LENGTH(str);
13459 if (len == 0)
13460 return 0;
13461 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13462 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013463 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010013464 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020013465 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013466 Py_INCREF(str);
13467 writer->buffer = str;
13468 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013469 writer->pos += len;
13470 return 0;
13471 }
13472 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13473 return -1;
13474 }
13475 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13476 str, 0, len);
13477 writer->pos += len;
13478 return 0;
13479}
13480
Victor Stinnere215d962012-10-06 23:03:36 +020013481int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013482_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13483 Py_ssize_t start, Py_ssize_t end)
13484{
13485 Py_UCS4 maxchar;
13486 Py_ssize_t len;
13487
13488 if (PyUnicode_READY(str) == -1)
13489 return -1;
13490
13491 assert(0 <= start);
13492 assert(end <= PyUnicode_GET_LENGTH(str));
13493 assert(start <= end);
13494
13495 if (end == 0)
13496 return 0;
13497
13498 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13499 return _PyUnicodeWriter_WriteStr(writer, str);
13500
13501 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13502 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13503 else
13504 maxchar = writer->maxchar;
13505 len = end - start;
13506
13507 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13508 return -1;
13509
13510 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13511 str, start, len);
13512 writer->pos += len;
13513 return 0;
13514}
13515
13516int
Victor Stinner4a587072013-11-19 12:54:53 +010013517_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13518 const char *ascii, Py_ssize_t len)
13519{
13520 if (len == -1)
13521 len = strlen(ascii);
13522
13523 assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13524
13525 if (writer->buffer == NULL && !writer->overallocate) {
13526 PyObject *str;
13527
13528 str = _PyUnicode_FromASCII(ascii, len);
13529 if (str == NULL)
13530 return -1;
13531
13532 writer->readonly = 1;
13533 writer->buffer = str;
13534 _PyUnicodeWriter_Update(writer);
13535 writer->pos += len;
13536 return 0;
13537 }
13538
13539 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13540 return -1;
13541
13542 switch (writer->kind)
13543 {
13544 case PyUnicode_1BYTE_KIND:
13545 {
13546 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13547 Py_UCS1 *data = writer->data;
13548
Christian Heimesf051e432016-09-13 20:22:02 +020013549 memcpy(data + writer->pos, str, len);
Victor Stinner4a587072013-11-19 12:54:53 +010013550 break;
13551 }
13552 case PyUnicode_2BYTE_KIND:
13553 {
13554 _PyUnicode_CONVERT_BYTES(
13555 Py_UCS1, Py_UCS2,
13556 ascii, ascii + len,
13557 (Py_UCS2 *)writer->data + writer->pos);
13558 break;
13559 }
13560 case PyUnicode_4BYTE_KIND:
13561 {
13562 _PyUnicode_CONVERT_BYTES(
13563 Py_UCS1, Py_UCS4,
13564 ascii, ascii + len,
13565 (Py_UCS4 *)writer->data + writer->pos);
13566 break;
13567 }
13568 default:
13569 assert(0);
13570 }
13571
13572 writer->pos += len;
13573 return 0;
13574}
13575
13576int
13577_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13578 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020013579{
13580 Py_UCS4 maxchar;
13581
13582 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13583 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13584 return -1;
13585 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13586 writer->pos += len;
13587 return 0;
13588}
13589
Victor Stinnerd3f08822012-05-29 12:57:52 +020013590PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013591_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013592{
Victor Stinner15a0bd32013-07-08 22:29:55 +020013593 PyObject *str;
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013594
Victor Stinnerd3f08822012-05-29 12:57:52 +020013595 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013596 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013597 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013598 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013599
13600 str = writer->buffer;
13601 writer->buffer = NULL;
13602
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013603 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013604 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13605 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013606 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013607
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013608 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
13609 PyObject *str2;
13610 str2 = resize_compact(str, writer->pos);
13611 if (str2 == NULL) {
13612 Py_DECREF(str);
13613 return NULL;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013614 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013615 str = str2;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013616 }
13617
Victor Stinner15a0bd32013-07-08 22:29:55 +020013618 assert(_PyUnicode_CheckConsistency(str, 1));
13619 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020013620}
13621
Victor Stinnerd3f08822012-05-29 12:57:52 +020013622void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013623_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013624{
13625 Py_CLEAR(writer->buffer);
13626}
13627
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013628#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013629
13630PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013631 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013632\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013633Return a formatted version of S, using substitutions from args and kwargs.\n\
13634The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013635
Eric Smith27bbca62010-11-04 17:06:58 +000013636PyDoc_STRVAR(format_map__doc__,
13637 "S.format_map(mapping) -> str\n\
13638\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013639Return a formatted version of S, using substitutions from mapping.\n\
13640The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013641
Eric Smith4a7d76d2008-05-30 18:10:19 +000013642static PyObject *
13643unicode__format__(PyObject* self, PyObject* args)
13644{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013645 PyObject *format_spec;
13646 _PyUnicodeWriter writer;
13647 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013648
13649 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
13650 return NULL;
13651
Victor Stinnerd3f08822012-05-29 12:57:52 +020013652 if (PyUnicode_READY(self) == -1)
13653 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020013654 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013655 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13656 self, format_spec, 0,
13657 PyUnicode_GET_LENGTH(format_spec));
13658 if (ret == -1) {
13659 _PyUnicodeWriter_Dealloc(&writer);
13660 return NULL;
13661 }
13662 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013663}
13664
Eric Smith8c663262007-08-25 02:26:07 +000013665PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013666 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013667\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013668Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000013669
13670static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013671unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013672{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013673 Py_ssize_t size;
13674
13675 /* If it's a compact object, account for base structure +
13676 character data. */
13677 if (PyUnicode_IS_COMPACT_ASCII(v))
13678 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
13679 else if (PyUnicode_IS_COMPACT(v))
13680 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013681 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013682 else {
13683 /* If it is a two-block object, account for base object, and
13684 for character block if present. */
13685 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020013686 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013687 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013688 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013689 }
13690 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013691 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020013692 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013693 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020013694 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020013695 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013696
13697 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013698}
13699
13700PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013701 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013702
13703static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013704unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013705{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013706 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013707 if (!copy)
13708 return NULL;
13709 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013710}
13711
Guido van Rossumd57fd912000-03-10 22:53:23 +000013712static PyMethodDef unicode_methods[] = {
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000013713 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013714 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
Ezio Melotticda6b6d2012-02-26 09:39:55 +020013715 {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
13716 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013717 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
13718 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
Benjamin Petersond5890c82012-01-14 13:23:30 -050013719 {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013720 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
13721 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
13722 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
Ezio Melotti745d54d2013-11-16 19:10:57 +020013723 {"expandtabs", (PyCFunction) unicode_expandtabs,
13724 METH_VARARGS | METH_KEYWORDS, expandtabs__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013725 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013726 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013727 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
13728 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
13729 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013730 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013731 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13732 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
13733 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013734 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013735 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Ezio Melotti745d54d2013-11-16 19:10:57 +020013736 {"splitlines", (PyCFunction) unicode_splitlines,
13737 METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013738 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013739 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
13740 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
13741 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
13742 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13743 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
13744 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
13745 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
13746 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
13747 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
13748 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
13749 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
13750 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
13751 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
13752 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000013753 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000013754 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013755 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000013756 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013757 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000013758 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Larry Hastings31826802013-10-19 00:09:25 -070013759 UNICODE_MAKETRANS_METHODDEF
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013760 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000013761#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013762 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013763 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013764#endif
13765
Benjamin Peterson14339b62009-01-31 16:36:08 +000013766 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013767 {NULL, NULL}
13768};
13769
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013770static PyObject *
13771unicode_mod(PyObject *v, PyObject *w)
13772{
Brian Curtindfc80e32011-08-10 20:28:54 -050013773 if (!PyUnicode_Check(v))
13774 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013775 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013776}
13777
13778static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013779 0, /*nb_add*/
13780 0, /*nb_subtract*/
13781 0, /*nb_multiply*/
13782 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013783};
13784
Guido van Rossumd57fd912000-03-10 22:53:23 +000013785static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013786 (lenfunc) unicode_length, /* sq_length */
13787 PyUnicode_Concat, /* sq_concat */
13788 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13789 (ssizeargfunc) unicode_getitem, /* sq_item */
13790 0, /* sq_slice */
13791 0, /* sq_ass_item */
13792 0, /* sq_ass_slice */
13793 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013794};
13795
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013796static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013797unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013798{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013799 if (PyUnicode_READY(self) == -1)
13800 return NULL;
13801
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013802 if (PyIndex_Check(item)) {
13803 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013804 if (i == -1 && PyErr_Occurred())
13805 return NULL;
13806 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013807 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013808 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013809 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013810 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013811 PyObject *result;
13812 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013813 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013814 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013815
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013816 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013817 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013818 return NULL;
13819 }
13820
13821 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020013822 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013823 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013824 slicelength == PyUnicode_GET_LENGTH(self)) {
13825 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013826 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013827 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013828 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013829 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013830 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013831 src_kind = PyUnicode_KIND(self);
13832 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013833 if (!PyUnicode_IS_ASCII(self)) {
13834 kind_limit = kind_maxchar_limit(src_kind);
13835 max_char = 0;
13836 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13837 ch = PyUnicode_READ(src_kind, src_data, cur);
13838 if (ch > max_char) {
13839 max_char = ch;
13840 if (max_char >= kind_limit)
13841 break;
13842 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013843 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013844 }
Victor Stinner55c99112011-10-13 01:17:06 +020013845 else
13846 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013847 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013848 if (result == NULL)
13849 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013850 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013851 dest_data = PyUnicode_DATA(result);
13852
13853 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013854 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13855 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013856 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013857 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013858 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013859 } else {
13860 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13861 return NULL;
13862 }
13863}
13864
13865static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013866 (lenfunc)unicode_length, /* mp_length */
13867 (binaryfunc)unicode_subscript, /* mp_subscript */
13868 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013869};
13870
Guido van Rossumd57fd912000-03-10 22:53:23 +000013871
Guido van Rossumd57fd912000-03-10 22:53:23 +000013872/* Helpers for PyUnicode_Format() */
13873
Victor Stinnera47082312012-10-04 02:19:54 +020013874struct unicode_formatter_t {
13875 PyObject *args;
13876 int args_owned;
13877 Py_ssize_t arglen, argidx;
13878 PyObject *dict;
13879
13880 enum PyUnicode_Kind fmtkind;
13881 Py_ssize_t fmtcnt, fmtpos;
13882 void *fmtdata;
13883 PyObject *fmtstr;
13884
13885 _PyUnicodeWriter writer;
13886};
13887
13888struct unicode_format_arg_t {
13889 Py_UCS4 ch;
13890 int flags;
13891 Py_ssize_t width;
13892 int prec;
13893 int sign;
13894};
13895
Guido van Rossumd57fd912000-03-10 22:53:23 +000013896static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020013897unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013898{
Victor Stinnera47082312012-10-04 02:19:54 +020013899 Py_ssize_t argidx = ctx->argidx;
13900
13901 if (argidx < ctx->arglen) {
13902 ctx->argidx++;
13903 if (ctx->arglen < 0)
13904 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000013905 else
Victor Stinnera47082312012-10-04 02:19:54 +020013906 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013907 }
13908 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013909 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013910 return NULL;
13911}
13912
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013913/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013914
Victor Stinnera47082312012-10-04 02:19:54 +020013915/* Format a float into the writer if the writer is not NULL, or into *p_output
13916 otherwise.
13917
13918 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020013919static int
Victor Stinnera47082312012-10-04 02:19:54 +020013920formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
13921 PyObject **p_output,
13922 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013923{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013924 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013925 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013926 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020013927 int prec;
13928 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000013929
Guido van Rossumd57fd912000-03-10 22:53:23 +000013930 x = PyFloat_AsDouble(v);
13931 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020013932 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013933
Victor Stinnera47082312012-10-04 02:19:54 +020013934 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013935 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013936 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013937
Victor Stinnera47082312012-10-04 02:19:54 +020013938 if (arg->flags & F_ALT)
13939 dtoa_flags = Py_DTSF_ALT;
13940 else
13941 dtoa_flags = 0;
13942 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013943 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020013944 return -1;
13945 len = strlen(p);
13946 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010013947 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020013948 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013949 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020013950 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020013951 }
13952 else
13953 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000013954 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013955 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013956}
13957
Victor Stinnerd0880d52012-04-27 23:40:13 +020013958/* formatlong() emulates the format codes d, u, o, x and X, and
13959 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
13960 * Python's regular ints.
13961 * Return value: a new PyUnicodeObject*, or NULL if error.
13962 * The output string is of the form
13963 * "-"? ("0x" | "0X")? digit+
13964 * "0x"/"0X" are present only for x and X conversions, with F_ALT
13965 * set in flags. The case of hex digits will be correct,
13966 * There will be at least prec digits, zero-filled on the left if
13967 * necessary to get that many.
13968 * val object to be converted
13969 * flags bitmask of format flags; only F_ALT is looked at
13970 * prec minimum number of digits; 0-fill on left if needed
13971 * type a character in [duoxX]; u acts the same as d
13972 *
13973 * CAUTION: o, x and X conversions on regular ints can never
13974 * produce a '-' sign, but can for Python's unbounded ints.
13975 */
Ethan Furmanb95b5612015-01-23 20:05:18 -080013976PyObject *
13977_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
Tim Peters38fd5b62000-09-21 05:43:11 +000013978{
Victor Stinnerd0880d52012-04-27 23:40:13 +020013979 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013980 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013981 Py_ssize_t i;
13982 int sign; /* 1 if '-', else 0 */
13983 int len; /* number of characters */
13984 Py_ssize_t llen;
13985 int numdigits; /* len == numnondigits + numdigits */
13986 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000013987
Victor Stinnerd0880d52012-04-27 23:40:13 +020013988 /* Avoid exceeding SSIZE_T_MAX */
13989 if (prec > INT_MAX-3) {
13990 PyErr_SetString(PyExc_OverflowError,
13991 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013992 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013993 }
13994
13995 assert(PyLong_Check(val));
13996
13997 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020013998 default:
13999 assert(!"'type' not in [diuoxX]");
Victor Stinnerd0880d52012-04-27 23:40:13 +020014000 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020014001 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020014002 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070014003 /* int and int subclasses should print numerically when a numeric */
14004 /* format code is used (see issue18780) */
14005 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014006 break;
14007 case 'o':
14008 numnondigits = 2;
14009 result = PyNumber_ToBase(val, 8);
14010 break;
14011 case 'x':
14012 case 'X':
14013 numnondigits = 2;
14014 result = PyNumber_ToBase(val, 16);
14015 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014016 }
14017 if (!result)
14018 return NULL;
14019
14020 assert(unicode_modifiable(result));
14021 assert(PyUnicode_IS_READY(result));
14022 assert(PyUnicode_IS_ASCII(result));
14023
14024 /* To modify the string in-place, there can only be one reference. */
14025 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014026 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014027 PyErr_BadInternalCall();
14028 return NULL;
14029 }
14030 buf = PyUnicode_DATA(result);
14031 llen = PyUnicode_GET_LENGTH(result);
14032 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014033 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014034 PyErr_SetString(PyExc_ValueError,
Ethan Furmanb95b5612015-01-23 20:05:18 -080014035 "string too large in _PyUnicode_FormatLong");
Victor Stinnerd0880d52012-04-27 23:40:13 +020014036 return NULL;
14037 }
14038 len = (int)llen;
14039 sign = buf[0] == '-';
14040 numnondigits += sign;
14041 numdigits = len - numnondigits;
14042 assert(numdigits > 0);
14043
14044 /* Get rid of base marker unless F_ALT */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014045 if (((alt) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020014046 (type == 'o' || type == 'x' || type == 'X'))) {
14047 assert(buf[sign] == '0');
14048 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14049 buf[sign+1] == 'o');
14050 numnondigits -= 2;
14051 buf += 2;
14052 len -= 2;
14053 if (sign)
14054 buf[0] = '-';
14055 assert(len == numnondigits + numdigits);
14056 assert(numdigits > 0);
14057 }
14058
14059 /* Fill with leading zeroes to meet minimum width. */
14060 if (prec > numdigits) {
14061 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14062 numnondigits + prec);
14063 char *b1;
14064 if (!r1) {
14065 Py_DECREF(result);
14066 return NULL;
14067 }
14068 b1 = PyBytes_AS_STRING(r1);
14069 for (i = 0; i < numnondigits; ++i)
14070 *b1++ = *buf++;
14071 for (i = 0; i < prec - numdigits; i++)
14072 *b1++ = '0';
14073 for (i = 0; i < numdigits; i++)
14074 *b1++ = *buf++;
14075 *b1 = '\0';
14076 Py_DECREF(result);
14077 result = r1;
14078 buf = PyBytes_AS_STRING(result);
14079 len = numnondigits + prec;
14080 }
14081
14082 /* Fix up case for hex conversions. */
14083 if (type == 'X') {
14084 /* Need to convert all lower case letters to upper case.
14085 and need to convert 0x to 0X (and -0x to -0X). */
14086 for (i = 0; i < len; i++)
14087 if (buf[i] >= 'a' && buf[i] <= 'x')
14088 buf[i] -= 'a'-'A';
14089 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014090 if (!PyUnicode_Check(result)
14091 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020014092 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014093 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014094 Py_DECREF(result);
14095 result = unicode;
14096 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014097 else if (len != PyUnicode_GET_LENGTH(result)) {
14098 if (PyUnicode_Resize(&result, len) < 0)
14099 Py_CLEAR(result);
14100 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014101 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000014102}
14103
Ethan Furmandf3ed242014-01-05 06:50:30 -080014104/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014105 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014106 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014107 * -1 and raise an exception on error */
14108static int
Victor Stinnera47082312012-10-04 02:19:54 +020014109mainformatlong(PyObject *v,
14110 struct unicode_format_arg_t *arg,
14111 PyObject **p_output,
14112 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014113{
14114 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014115 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014116
14117 if (!PyNumber_Check(v))
14118 goto wrongtype;
14119
Ethan Furman9ab74802014-03-21 06:38:46 -070014120 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014121 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014122 if (type == 'o' || type == 'x' || type == 'X') {
14123 iobj = PyNumber_Index(v);
14124 if (iobj == NULL) {
Ethan Furman9ab74802014-03-21 06:38:46 -070014125 if (PyErr_ExceptionMatches(PyExc_TypeError))
14126 goto wrongtype;
Ethan Furman38d872e2014-03-19 08:38:52 -070014127 return -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014128 }
14129 }
14130 else {
14131 iobj = PyNumber_Long(v);
14132 if (iobj == NULL ) {
14133 if (PyErr_ExceptionMatches(PyExc_TypeError))
14134 goto wrongtype;
14135 return -1;
14136 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014137 }
14138 assert(PyLong_Check(iobj));
14139 }
14140 else {
14141 iobj = v;
14142 Py_INCREF(iobj);
14143 }
14144
14145 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014146 && arg->width == -1 && arg->prec == -1
14147 && !(arg->flags & (F_SIGN | F_BLANK))
14148 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014149 {
14150 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014151 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014152 int base;
14153
Victor Stinnera47082312012-10-04 02:19:54 +020014154 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014155 {
14156 default:
14157 assert(0 && "'type' not in [diuoxX]");
14158 case 'd':
14159 case 'i':
14160 case 'u':
14161 base = 10;
14162 break;
14163 case 'o':
14164 base = 8;
14165 break;
14166 case 'x':
14167 case 'X':
14168 base = 16;
14169 break;
14170 }
14171
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014172 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14173 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014174 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014175 }
14176 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014177 return 1;
14178 }
14179
Ethan Furmanb95b5612015-01-23 20:05:18 -080014180 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014181 Py_DECREF(iobj);
14182 if (res == NULL)
14183 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014184 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014185 return 0;
14186
14187wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014188 switch(type)
14189 {
14190 case 'o':
14191 case 'x':
14192 case 'X':
14193 PyErr_Format(PyExc_TypeError,
14194 "%%%c format: an integer is required, "
14195 "not %.200s",
14196 type, Py_TYPE(v)->tp_name);
14197 break;
14198 default:
14199 PyErr_Format(PyExc_TypeError,
14200 "%%%c format: a number is required, "
14201 "not %.200s",
14202 type, Py_TYPE(v)->tp_name);
14203 break;
14204 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014205 return -1;
14206}
14207
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014208static Py_UCS4
14209formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014210{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014211 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014212 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014213 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014214 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014215 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014216 goto onError;
14217 }
14218 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014219 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014220 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014221 /* make sure number is a type of integer */
14222 if (!PyLong_Check(v)) {
14223 iobj = PyNumber_Index(v);
14224 if (iobj == NULL) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014225 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014226 }
14227 v = iobj;
14228 Py_DECREF(iobj);
14229 }
14230 /* Integer input truncated to a character */
Benjamin Peterson29060642009-01-31 22:14:21 +000014231 x = PyLong_AsLong(v);
14232 if (x == -1 && PyErr_Occurred())
14233 goto onError;
14234
Victor Stinner8faf8212011-12-08 22:14:11 +010014235 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014236 PyErr_SetString(PyExc_OverflowError,
14237 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014238 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014239 }
14240
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014241 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014242 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014243
Benjamin Peterson29060642009-01-31 22:14:21 +000014244 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014245 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014246 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014247 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014248}
14249
Victor Stinnera47082312012-10-04 02:19:54 +020014250/* Parse options of an argument: flags, width, precision.
14251 Handle also "%(name)" syntax.
14252
14253 Return 0 if the argument has been formatted into arg->str.
14254 Return 1 if the argument has been written into ctx->writer,
14255 Raise an exception and return -1 on error. */
14256static int
14257unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14258 struct unicode_format_arg_t *arg)
14259{
14260#define FORMAT_READ(ctx) \
14261 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14262
14263 PyObject *v;
14264
Victor Stinnera47082312012-10-04 02:19:54 +020014265 if (arg->ch == '(') {
14266 /* Get argument value from a dictionary. Example: "%(name)s". */
14267 Py_ssize_t keystart;
14268 Py_ssize_t keylen;
14269 PyObject *key;
14270 int pcount = 1;
14271
14272 if (ctx->dict == NULL) {
14273 PyErr_SetString(PyExc_TypeError,
14274 "format requires a mapping");
14275 return -1;
14276 }
14277 ++ctx->fmtpos;
14278 --ctx->fmtcnt;
14279 keystart = ctx->fmtpos;
14280 /* Skip over balanced parentheses */
14281 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14282 arg->ch = FORMAT_READ(ctx);
14283 if (arg->ch == ')')
14284 --pcount;
14285 else if (arg->ch == '(')
14286 ++pcount;
14287 ctx->fmtpos++;
14288 }
14289 keylen = ctx->fmtpos - keystart - 1;
14290 if (ctx->fmtcnt < 0 || pcount > 0) {
14291 PyErr_SetString(PyExc_ValueError,
14292 "incomplete format key");
14293 return -1;
14294 }
14295 key = PyUnicode_Substring(ctx->fmtstr,
14296 keystart, keystart + keylen);
14297 if (key == NULL)
14298 return -1;
14299 if (ctx->args_owned) {
Victor Stinnera47082312012-10-04 02:19:54 +020014300 ctx->args_owned = 0;
Serhiy Storchaka191321d2015-12-27 15:41:34 +020014301 Py_DECREF(ctx->args);
Victor Stinnera47082312012-10-04 02:19:54 +020014302 }
14303 ctx->args = PyObject_GetItem(ctx->dict, key);
14304 Py_DECREF(key);
14305 if (ctx->args == NULL)
14306 return -1;
14307 ctx->args_owned = 1;
14308 ctx->arglen = -1;
14309 ctx->argidx = -2;
14310 }
14311
14312 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014313 while (--ctx->fmtcnt >= 0) {
14314 arg->ch = FORMAT_READ(ctx);
14315 ctx->fmtpos++;
14316 switch (arg->ch) {
14317 case '-': arg->flags |= F_LJUST; continue;
14318 case '+': arg->flags |= F_SIGN; continue;
14319 case ' ': arg->flags |= F_BLANK; continue;
14320 case '#': arg->flags |= F_ALT; continue;
14321 case '0': arg->flags |= F_ZERO; continue;
14322 }
14323 break;
14324 }
14325
14326 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014327 if (arg->ch == '*') {
14328 v = unicode_format_getnextarg(ctx);
14329 if (v == NULL)
14330 return -1;
14331 if (!PyLong_Check(v)) {
14332 PyErr_SetString(PyExc_TypeError,
14333 "* wants int");
14334 return -1;
14335 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014336 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014337 if (arg->width == -1 && PyErr_Occurred())
14338 return -1;
14339 if (arg->width < 0) {
14340 arg->flags |= F_LJUST;
14341 arg->width = -arg->width;
14342 }
14343 if (--ctx->fmtcnt >= 0) {
14344 arg->ch = FORMAT_READ(ctx);
14345 ctx->fmtpos++;
14346 }
14347 }
14348 else if (arg->ch >= '0' && arg->ch <= '9') {
14349 arg->width = arg->ch - '0';
14350 while (--ctx->fmtcnt >= 0) {
14351 arg->ch = FORMAT_READ(ctx);
14352 ctx->fmtpos++;
14353 if (arg->ch < '0' || arg->ch > '9')
14354 break;
14355 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14356 mixing signed and unsigned comparison. Since arg->ch is between
14357 '0' and '9', casting to int is safe. */
14358 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14359 PyErr_SetString(PyExc_ValueError,
14360 "width too big");
14361 return -1;
14362 }
14363 arg->width = arg->width*10 + (arg->ch - '0');
14364 }
14365 }
14366
14367 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014368 if (arg->ch == '.') {
14369 arg->prec = 0;
14370 if (--ctx->fmtcnt >= 0) {
14371 arg->ch = FORMAT_READ(ctx);
14372 ctx->fmtpos++;
14373 }
14374 if (arg->ch == '*') {
14375 v = unicode_format_getnextarg(ctx);
14376 if (v == NULL)
14377 return -1;
14378 if (!PyLong_Check(v)) {
14379 PyErr_SetString(PyExc_TypeError,
14380 "* wants int");
14381 return -1;
14382 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014383 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014384 if (arg->prec == -1 && PyErr_Occurred())
14385 return -1;
14386 if (arg->prec < 0)
14387 arg->prec = 0;
14388 if (--ctx->fmtcnt >= 0) {
14389 arg->ch = FORMAT_READ(ctx);
14390 ctx->fmtpos++;
14391 }
14392 }
14393 else if (arg->ch >= '0' && arg->ch <= '9') {
14394 arg->prec = arg->ch - '0';
14395 while (--ctx->fmtcnt >= 0) {
14396 arg->ch = FORMAT_READ(ctx);
14397 ctx->fmtpos++;
14398 if (arg->ch < '0' || arg->ch > '9')
14399 break;
14400 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14401 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014402 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014403 return -1;
14404 }
14405 arg->prec = arg->prec*10 + (arg->ch - '0');
14406 }
14407 }
14408 }
14409
14410 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14411 if (ctx->fmtcnt >= 0) {
14412 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14413 if (--ctx->fmtcnt >= 0) {
14414 arg->ch = FORMAT_READ(ctx);
14415 ctx->fmtpos++;
14416 }
14417 }
14418 }
14419 if (ctx->fmtcnt < 0) {
14420 PyErr_SetString(PyExc_ValueError,
14421 "incomplete format");
14422 return -1;
14423 }
14424 return 0;
14425
14426#undef FORMAT_READ
14427}
14428
14429/* Format one argument. Supported conversion specifiers:
14430
14431 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014432 - "i", "d", "u": int or float
14433 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014434 - "e", "E", "f", "F", "g", "G": float
14435 - "c": int or str (1 character)
14436
Victor Stinner8dbd4212012-12-04 09:30:24 +010014437 When possible, the output is written directly into the Unicode writer
14438 (ctx->writer). A string is created when padding is required.
14439
Victor Stinnera47082312012-10-04 02:19:54 +020014440 Return 0 if the argument has been formatted into *p_str,
14441 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014442 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014443static int
14444unicode_format_arg_format(struct unicode_formatter_t *ctx,
14445 struct unicode_format_arg_t *arg,
14446 PyObject **p_str)
14447{
14448 PyObject *v;
14449 _PyUnicodeWriter *writer = &ctx->writer;
14450
14451 if (ctx->fmtcnt == 0)
14452 ctx->writer.overallocate = 0;
14453
14454 if (arg->ch == '%') {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014455 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014456 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014457 return 1;
14458 }
14459
14460 v = unicode_format_getnextarg(ctx);
14461 if (v == NULL)
14462 return -1;
14463
Victor Stinnera47082312012-10-04 02:19:54 +020014464
14465 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014466 case 's':
14467 case 'r':
14468 case 'a':
14469 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14470 /* Fast path */
14471 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14472 return -1;
14473 return 1;
14474 }
14475
14476 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14477 *p_str = v;
14478 Py_INCREF(*p_str);
14479 }
14480 else {
14481 if (arg->ch == 's')
14482 *p_str = PyObject_Str(v);
14483 else if (arg->ch == 'r')
14484 *p_str = PyObject_Repr(v);
14485 else
14486 *p_str = PyObject_ASCII(v);
14487 }
14488 break;
14489
14490 case 'i':
14491 case 'd':
14492 case 'u':
14493 case 'o':
14494 case 'x':
14495 case 'X':
14496 {
14497 int ret = mainformatlong(v, arg, p_str, writer);
14498 if (ret != 0)
14499 return ret;
14500 arg->sign = 1;
14501 break;
14502 }
14503
14504 case 'e':
14505 case 'E':
14506 case 'f':
14507 case 'F':
14508 case 'g':
14509 case 'G':
14510 if (arg->width == -1 && arg->prec == -1
14511 && !(arg->flags & (F_SIGN | F_BLANK)))
14512 {
14513 /* Fast path */
14514 if (formatfloat(v, arg, NULL, writer) == -1)
14515 return -1;
14516 return 1;
14517 }
14518
14519 arg->sign = 1;
14520 if (formatfloat(v, arg, p_str, NULL) == -1)
14521 return -1;
14522 break;
14523
14524 case 'c':
14525 {
14526 Py_UCS4 ch = formatchar(v);
14527 if (ch == (Py_UCS4) -1)
14528 return -1;
14529 if (arg->width == -1 && arg->prec == -1) {
14530 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014531 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014532 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014533 return 1;
14534 }
14535 *p_str = PyUnicode_FromOrdinal(ch);
14536 break;
14537 }
14538
14539 default:
14540 PyErr_Format(PyExc_ValueError,
14541 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020014542 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020014543 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14544 (int)arg->ch,
14545 ctx->fmtpos - 1);
14546 return -1;
14547 }
14548 if (*p_str == NULL)
14549 return -1;
14550 assert (PyUnicode_Check(*p_str));
14551 return 0;
14552}
14553
14554static int
14555unicode_format_arg_output(struct unicode_formatter_t *ctx,
14556 struct unicode_format_arg_t *arg,
14557 PyObject *str)
14558{
14559 Py_ssize_t len;
14560 enum PyUnicode_Kind kind;
14561 void *pbuf;
14562 Py_ssize_t pindex;
14563 Py_UCS4 signchar;
14564 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014565 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014566 Py_ssize_t sublen;
14567 _PyUnicodeWriter *writer = &ctx->writer;
14568 Py_UCS4 fill;
14569
14570 fill = ' ';
14571 if (arg->sign && arg->flags & F_ZERO)
14572 fill = '0';
14573
14574 if (PyUnicode_READY(str) == -1)
14575 return -1;
14576
14577 len = PyUnicode_GET_LENGTH(str);
14578 if ((arg->width == -1 || arg->width <= len)
14579 && (arg->prec == -1 || arg->prec >= len)
14580 && !(arg->flags & (F_SIGN | F_BLANK)))
14581 {
14582 /* Fast path */
14583 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
14584 return -1;
14585 return 0;
14586 }
14587
14588 /* Truncate the string for "s", "r" and "a" formats
14589 if the precision is set */
14590 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
14591 if (arg->prec >= 0 && len > arg->prec)
14592 len = arg->prec;
14593 }
14594
14595 /* Adjust sign and width */
14596 kind = PyUnicode_KIND(str);
14597 pbuf = PyUnicode_DATA(str);
14598 pindex = 0;
14599 signchar = '\0';
14600 if (arg->sign) {
14601 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14602 if (ch == '-' || ch == '+') {
14603 signchar = ch;
14604 len--;
14605 pindex++;
14606 }
14607 else if (arg->flags & F_SIGN)
14608 signchar = '+';
14609 else if (arg->flags & F_BLANK)
14610 signchar = ' ';
14611 else
14612 arg->sign = 0;
14613 }
14614 if (arg->width < len)
14615 arg->width = len;
14616
14617 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014618 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014619 if (!(arg->flags & F_LJUST)) {
14620 if (arg->sign) {
14621 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014622 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014623 }
14624 else {
14625 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014626 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014627 }
14628 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014629 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
14630 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014631 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014632 }
14633
Victor Stinnera47082312012-10-04 02:19:54 +020014634 buflen = arg->width;
14635 if (arg->sign && len == arg->width)
14636 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014637 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020014638 return -1;
14639
14640 /* Write the sign if needed */
14641 if (arg->sign) {
14642 if (fill != ' ') {
14643 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14644 writer->pos += 1;
14645 }
14646 if (arg->width > len)
14647 arg->width--;
14648 }
14649
14650 /* Write the numeric prefix for "x", "X" and "o" formats
14651 if the alternate form is used.
14652 For example, write "0x" for the "%#x" format. */
14653 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14654 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14655 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14656 if (fill != ' ') {
14657 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14658 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14659 writer->pos += 2;
14660 pindex += 2;
14661 }
14662 arg->width -= 2;
14663 if (arg->width < 0)
14664 arg->width = 0;
14665 len -= 2;
14666 }
14667
14668 /* Pad left with the fill character if needed */
14669 if (arg->width > len && !(arg->flags & F_LJUST)) {
14670 sublen = arg->width - len;
14671 FILL(writer->kind, writer->data, fill, writer->pos, sublen);
14672 writer->pos += sublen;
14673 arg->width = len;
14674 }
14675
14676 /* If padding with spaces: write sign if needed and/or numeric prefix if
14677 the alternate form is used */
14678 if (fill == ' ') {
14679 if (arg->sign) {
14680 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14681 writer->pos += 1;
14682 }
14683 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14684 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14685 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
14686 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14687 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14688 writer->pos += 2;
14689 pindex += 2;
14690 }
14691 }
14692
14693 /* Write characters */
14694 if (len) {
14695 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14696 str, pindex, len);
14697 writer->pos += len;
14698 }
14699
14700 /* Pad right with the fill character if needed */
14701 if (arg->width > len) {
14702 sublen = arg->width - len;
14703 FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
14704 writer->pos += sublen;
14705 }
14706 return 0;
14707}
14708
14709/* Helper of PyUnicode_Format(): format one arg.
14710 Return 0 on success, raise an exception and return -1 on error. */
14711static int
14712unicode_format_arg(struct unicode_formatter_t *ctx)
14713{
14714 struct unicode_format_arg_t arg;
14715 PyObject *str;
14716 int ret;
14717
Victor Stinner8dbd4212012-12-04 09:30:24 +010014718 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
14719 arg.flags = 0;
14720 arg.width = -1;
14721 arg.prec = -1;
14722 arg.sign = 0;
14723 str = NULL;
14724
Victor Stinnera47082312012-10-04 02:19:54 +020014725 ret = unicode_format_arg_parse(ctx, &arg);
14726 if (ret == -1)
14727 return -1;
14728
14729 ret = unicode_format_arg_format(ctx, &arg, &str);
14730 if (ret == -1)
14731 return -1;
14732
14733 if (ret != 1) {
14734 ret = unicode_format_arg_output(ctx, &arg, str);
14735 Py_DECREF(str);
14736 if (ret == -1)
14737 return -1;
14738 }
14739
14740 if (ctx->dict && (ctx->argidx < ctx->arglen) && arg.ch != '%') {
14741 PyErr_SetString(PyExc_TypeError,
14742 "not all arguments converted during string formatting");
14743 return -1;
14744 }
14745 return 0;
14746}
14747
Alexander Belopolsky40018472011-02-26 01:02:56 +000014748PyObject *
14749PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014750{
Victor Stinnera47082312012-10-04 02:19:54 +020014751 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000014752
Guido van Rossumd57fd912000-03-10 22:53:23 +000014753 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014754 PyErr_BadInternalCall();
14755 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014756 }
Victor Stinnera47082312012-10-04 02:19:54 +020014757
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014758 if (ensure_unicode(format) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014759 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014760
14761 ctx.fmtstr = format;
Victor Stinnera47082312012-10-04 02:19:54 +020014762 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14763 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14764 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14765 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014766
Victor Stinner8f674cc2013-04-17 23:02:17 +020014767 _PyUnicodeWriter_Init(&ctx.writer);
14768 ctx.writer.min_length = ctx.fmtcnt + 100;
14769 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014770
Guido van Rossumd57fd912000-03-10 22:53:23 +000014771 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014772 ctx.arglen = PyTuple_Size(args);
14773 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014774 }
14775 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014776 ctx.arglen = -1;
14777 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014778 }
Victor Stinnera47082312012-10-04 02:19:54 +020014779 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040014780 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020014781 ctx.dict = args;
14782 else
14783 ctx.dict = NULL;
14784 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014785
Victor Stinnera47082312012-10-04 02:19:54 +020014786 while (--ctx.fmtcnt >= 0) {
14787 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020014788 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020014789
14790 nonfmtpos = ctx.fmtpos++;
14791 while (ctx.fmtcnt >= 0 &&
14792 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14793 ctx.fmtpos++;
14794 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014795 }
Victor Stinnera47082312012-10-04 02:19:54 +020014796 if (ctx.fmtcnt < 0) {
14797 ctx.fmtpos--;
14798 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020014799 }
Victor Stinneree4544c2012-05-09 22:24:08 +020014800
Victor Stinnercfc4c132013-04-03 01:48:39 +020014801 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14802 nonfmtpos, ctx.fmtpos) < 0)
14803 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014804 }
14805 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014806 ctx.fmtpos++;
14807 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000014808 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020014809 }
14810 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020014811
Victor Stinnera47082312012-10-04 02:19:54 +020014812 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014813 PyErr_SetString(PyExc_TypeError,
14814 "not all arguments converted during string formatting");
14815 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014816 }
14817
Victor Stinnera47082312012-10-04 02:19:54 +020014818 if (ctx.args_owned) {
14819 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014820 }
Victor Stinnera47082312012-10-04 02:19:54 +020014821 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014822
Benjamin Peterson29060642009-01-31 22:14:21 +000014823 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020014824 _PyUnicodeWriter_Dealloc(&ctx.writer);
14825 if (ctx.args_owned) {
14826 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014827 }
14828 return NULL;
14829}
14830
Jeremy Hylton938ace62002-07-17 16:30:39 +000014831static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000014832unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14833
Tim Peters6d6c1a32001-08-02 04:15:00 +000014834static PyObject *
14835unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14836{
Benjamin Peterson29060642009-01-31 22:14:21 +000014837 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014838 static char *kwlist[] = {"object", "encoding", "errors", 0};
14839 char *encoding = NULL;
14840 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000014841
Benjamin Peterson14339b62009-01-31 16:36:08 +000014842 if (type != &PyUnicode_Type)
14843 return unicode_subtype_new(type, args, kwds);
14844 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000014845 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000014846 return NULL;
14847 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020014848 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000014849 if (encoding == NULL && errors == NULL)
14850 return PyObject_Str(x);
14851 else
Benjamin Peterson29060642009-01-31 22:14:21 +000014852 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000014853}
14854
Guido van Rossume023fe02001-08-30 03:12:59 +000014855static PyObject *
14856unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14857{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014858 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014859 Py_ssize_t length, char_size;
14860 int share_wstr, share_utf8;
14861 unsigned int kind;
14862 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000014863
Benjamin Peterson14339b62009-01-31 16:36:08 +000014864 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014865
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014866 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014867 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014868 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014869 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050014870 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060014871 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014872 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060014873 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014874
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014875 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014876 if (self == NULL) {
14877 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014878 return NULL;
14879 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014880 kind = PyUnicode_KIND(unicode);
14881 length = PyUnicode_GET_LENGTH(unicode);
14882
14883 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014884#ifdef Py_DEBUG
14885 _PyUnicode_HASH(self) = -1;
14886#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014887 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014888#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014889 _PyUnicode_STATE(self).interned = 0;
14890 _PyUnicode_STATE(self).kind = kind;
14891 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020014892 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014893 _PyUnicode_STATE(self).ready = 1;
14894 _PyUnicode_WSTR(self) = NULL;
14895 _PyUnicode_UTF8_LENGTH(self) = 0;
14896 _PyUnicode_UTF8(self) = NULL;
14897 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020014898 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014899
14900 share_utf8 = 0;
14901 share_wstr = 0;
14902 if (kind == PyUnicode_1BYTE_KIND) {
14903 char_size = 1;
14904 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
14905 share_utf8 = 1;
14906 }
14907 else if (kind == PyUnicode_2BYTE_KIND) {
14908 char_size = 2;
14909 if (sizeof(wchar_t) == 2)
14910 share_wstr = 1;
14911 }
14912 else {
14913 assert(kind == PyUnicode_4BYTE_KIND);
14914 char_size = 4;
14915 if (sizeof(wchar_t) == 4)
14916 share_wstr = 1;
14917 }
14918
14919 /* Ensure we won't overflow the length. */
14920 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
14921 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014922 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014923 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014924 data = PyObject_MALLOC((length + 1) * char_size);
14925 if (data == NULL) {
14926 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014927 goto onError;
14928 }
14929
Victor Stinnerc3c74152011-10-02 20:39:55 +020014930 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014931 if (share_utf8) {
14932 _PyUnicode_UTF8_LENGTH(self) = length;
14933 _PyUnicode_UTF8(self) = data;
14934 }
14935 if (share_wstr) {
14936 _PyUnicode_WSTR_LENGTH(self) = length;
14937 _PyUnicode_WSTR(self) = (wchar_t *)data;
14938 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014939
Christian Heimesf051e432016-09-13 20:22:02 +020014940 memcpy(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020014941 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014942 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014943#ifdef Py_DEBUG
14944 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14945#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020014946 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010014947 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014948
14949onError:
14950 Py_DECREF(unicode);
14951 Py_DECREF(self);
14952 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000014953}
14954
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000014955PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070014956"str(object='') -> str\n\
14957str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000014958\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100014959Create a new string object from the given object. If encoding or\n\
14960errors is specified, then the object must expose a data buffer\n\
14961that will be decoded using the given encoding and error handler.\n\
14962Otherwise, returns the result of object.__str__() (if defined)\n\
14963or repr(object).\n\
14964encoding defaults to sys.getdefaultencoding().\n\
14965errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000014966
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014967static PyObject *unicode_iter(PyObject *seq);
14968
Guido van Rossumd57fd912000-03-10 22:53:23 +000014969PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000014970 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014971 "str", /* tp_name */
14972 sizeof(PyUnicodeObject), /* tp_size */
14973 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014974 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014975 (destructor)unicode_dealloc, /* tp_dealloc */
14976 0, /* tp_print */
14977 0, /* tp_getattr */
14978 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014979 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014980 unicode_repr, /* tp_repr */
14981 &unicode_as_number, /* tp_as_number */
14982 &unicode_as_sequence, /* tp_as_sequence */
14983 &unicode_as_mapping, /* tp_as_mapping */
14984 (hashfunc) unicode_hash, /* tp_hash*/
14985 0, /* tp_call*/
14986 (reprfunc) unicode_str, /* tp_str */
14987 PyObject_GenericGetAttr, /* tp_getattro */
14988 0, /* tp_setattro */
14989 0, /* tp_as_buffer */
14990 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000014991 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014992 unicode_doc, /* tp_doc */
14993 0, /* tp_traverse */
14994 0, /* tp_clear */
14995 PyUnicode_RichCompare, /* tp_richcompare */
14996 0, /* tp_weaklistoffset */
14997 unicode_iter, /* tp_iter */
14998 0, /* tp_iternext */
14999 unicode_methods, /* tp_methods */
15000 0, /* tp_members */
15001 0, /* tp_getset */
15002 &PyBaseObject_Type, /* tp_base */
15003 0, /* tp_dict */
15004 0, /* tp_descr_get */
15005 0, /* tp_descr_set */
15006 0, /* tp_dictoffset */
15007 0, /* tp_init */
15008 0, /* tp_alloc */
15009 unicode_new, /* tp_new */
15010 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015011};
15012
15013/* Initialize the Unicode implementation */
15014
Victor Stinner3a50e702011-10-18 21:21:00 +020015015int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015016{
Thomas Wouters477c8d52006-05-27 19:21:47 +000015017 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015018 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000015019 0x000A, /* LINE FEED */
15020 0x000D, /* CARRIAGE RETURN */
15021 0x001C, /* FILE SEPARATOR */
15022 0x001D, /* GROUP SEPARATOR */
15023 0x001E, /* RECORD SEPARATOR */
15024 0x0085, /* NEXT LINE */
15025 0x2028, /* LINE SEPARATOR */
15026 0x2029, /* PARAGRAPH SEPARATOR */
15027 };
15028
Fred Drakee4315f52000-05-09 19:53:39 +000015029 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020015030 _Py_INCREF_UNICODE_EMPTY();
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015031 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015032 Py_FatalError("Can't create empty string");
Serhiy Storchaka678db842013-01-26 12:16:36 +020015033 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015034
Guido van Rossumcacfc072002-05-24 19:01:59 +000015035 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000015036 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000015037
15038 /* initialize the linebreak bloom filter */
15039 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015040 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020015041 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015042
Christian Heimes26532f72013-07-20 14:57:16 +020015043 if (PyType_Ready(&EncodingMapType) < 0)
15044 Py_FatalError("Can't initialize encoding map type");
Victor Stinner3a50e702011-10-18 21:21:00 +020015045
Benjamin Petersonc4311282012-10-30 23:21:10 -040015046 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
15047 Py_FatalError("Can't initialize field name iterator type");
15048
15049 if (PyType_Ready(&PyFormatterIter_Type) < 0)
15050 Py_FatalError("Can't initialize formatter iter type");
Benjamin Petersone8ea97f2012-10-30 23:27:52 -040015051
Victor Stinner3a50e702011-10-18 21:21:00 +020015052 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015053}
15054
15055/* Finalize the Unicode implementation */
15056
/* This function releases nothing and always returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    return 0;
}
15062
Guido van Rossumd57fd912000-03-10 22:53:23 +000015063void
Thomas Wouters78890102000-07-22 19:25:51 +000015064_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015065{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000015066 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015067
Serhiy Storchaka05997252013-01-26 12:14:02 +020015068 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +000015069
Serhiy Storchaka05997252013-01-26 12:14:02 +020015070 for (i = 0; i < 256; i++)
15071 Py_CLEAR(unicode_latin1[i]);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020015072 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000015073 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015074}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015075
Walter Dörwald16807132007-05-25 13:52:07 +000015076void
15077PyUnicode_InternInPlace(PyObject **p)
15078{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015079 PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015080 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015081#ifdef Py_DEBUG
15082 assert(s != NULL);
15083 assert(_PyUnicode_CHECK(s));
15084#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000015085 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020015086 return;
15087#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015088 /* If it's a subclass, we don't really know what putting
15089 it in the interned dict might do. */
15090 if (!PyUnicode_CheckExact(s))
15091 return;
15092 if (PyUnicode_CHECK_INTERNED(s))
15093 return;
15094 if (interned == NULL) {
15095 interned = PyDict_New();
15096 if (interned == NULL) {
15097 PyErr_Clear(); /* Don't leave an exception */
15098 return;
15099 }
15100 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015101 Py_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015102 t = PyDict_SetDefault(interned, s, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015103 Py_END_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015104 if (t == NULL) {
15105 PyErr_Clear();
15106 return;
15107 }
15108 if (t != s) {
Victor Stinnerf0335102013-04-14 19:13:03 +020015109 Py_INCREF(t);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030015110 Py_SETREF(*p, t);
Victor Stinnerf0335102013-04-14 19:13:03 +020015111 return;
15112 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015113 /* The two references in interned are not counted by refcnt.
15114 The deallocator will take care of this */
15115 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015116 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000015117}
15118
15119void
15120PyUnicode_InternImmortal(PyObject **p)
15121{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015122 PyUnicode_InternInPlace(p);
15123 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015124 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015125 Py_INCREF(*p);
15126 }
Walter Dörwald16807132007-05-25 13:52:07 +000015127}
15128
15129PyObject *
15130PyUnicode_InternFromString(const char *cp)
15131{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015132 PyObject *s = PyUnicode_FromString(cp);
15133 if (s == NULL)
15134 return NULL;
15135 PyUnicode_InternInPlace(&s);
15136 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015137}
15138
Alexander Belopolsky40018472011-02-26 01:02:56 +000015139void
15140_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000015141{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015142 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015143 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015144 Py_ssize_t i, n;
15145 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000015146
Benjamin Peterson14339b62009-01-31 16:36:08 +000015147 if (interned == NULL || !PyDict_Check(interned))
15148 return;
15149 keys = PyDict_Keys(interned);
15150 if (keys == NULL || !PyList_Check(keys)) {
15151 PyErr_Clear();
15152 return;
15153 }
Walter Dörwald16807132007-05-25 13:52:07 +000015154
Benjamin Peterson14339b62009-01-31 16:36:08 +000015155 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
15156 detector, interned unicode strings are not forcibly deallocated;
15157 rather, we give them their stolen references back, and then clear
15158 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000015159
Benjamin Peterson14339b62009-01-31 16:36:08 +000015160 n = PyList_GET_SIZE(keys);
15161 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000015162 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015163 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015164 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015165 if (PyUnicode_READY(s) == -1) {
15166 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015167 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015168 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015169 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015170 case SSTATE_NOT_INTERNED:
15171 /* XXX Shouldn't happen */
15172 break;
15173 case SSTATE_INTERNED_IMMORTAL:
15174 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015175 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015176 break;
15177 case SSTATE_INTERNED_MORTAL:
15178 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015179 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015180 break;
15181 default:
15182 Py_FatalError("Inconsistent interned string state.");
15183 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015184 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015185 }
15186 fprintf(stderr, "total size of all interned strings: "
15187 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
15188 "mortal/immortal\n", mortal_size, immortal_size);
15189 Py_DECREF(keys);
15190 PyDict_Clear(interned);
Serhiy Storchaka05997252013-01-26 12:14:02 +020015191 Py_CLEAR(interned);
Walter Dörwald16807132007-05-25 13:52:07 +000015192}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015193
15194
15195/********************* Unicode Iterator **************************/
15196
15197typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015198 PyObject_HEAD
15199 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015200 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015201} unicodeiterobject;
15202
15203static void
15204unicodeiter_dealloc(unicodeiterobject *it)
15205{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015206 _PyObject_GC_UNTRACK(it);
15207 Py_XDECREF(it->it_seq);
15208 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015209}
15210
15211static int
15212unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15213{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015214 Py_VISIT(it->it_seq);
15215 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015216}
15217
15218static PyObject *
15219unicodeiter_next(unicodeiterobject *it)
15220{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015221 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015222
Benjamin Peterson14339b62009-01-31 16:36:08 +000015223 assert(it != NULL);
15224 seq = it->it_seq;
15225 if (seq == NULL)
15226 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015227 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015228
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015229 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15230 int kind = PyUnicode_KIND(seq);
15231 void *data = PyUnicode_DATA(seq);
15232 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15233 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015234 if (item != NULL)
15235 ++it->it_index;
15236 return item;
15237 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015238
Benjamin Peterson14339b62009-01-31 16:36:08 +000015239 it->it_seq = NULL;
Serhiy Storchakafbb1c5e2016-03-30 20:40:02 +030015240 Py_DECREF(seq);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015241 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015242}
15243
15244static PyObject *
15245unicodeiter_len(unicodeiterobject *it)
15246{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015247 Py_ssize_t len = 0;
15248 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015249 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015250 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015251}
15252
15253PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15254
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015255static PyObject *
15256unicodeiter_reduce(unicodeiterobject *it)
15257{
15258 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020015259 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015260 it->it_seq, it->it_index);
15261 } else {
15262 PyObject *u = PyUnicode_FromUnicode(NULL, 0);
15263 if (u == NULL)
15264 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020015265 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015266 }
15267}
15268
15269PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15270
15271static PyObject *
15272unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15273{
15274 Py_ssize_t index = PyLong_AsSsize_t(state);
15275 if (index == -1 && PyErr_Occurred())
15276 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015277 if (it->it_seq != NULL) {
15278 if (index < 0)
15279 index = 0;
15280 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15281 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15282 it->it_index = index;
15283 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015284 Py_RETURN_NONE;
15285}
15286
15287PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15288
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015289static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015290 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015291 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015292 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15293 reduce_doc},
15294 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15295 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015296 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015297};
15298
15299PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015300 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15301 "str_iterator", /* tp_name */
15302 sizeof(unicodeiterobject), /* tp_basicsize */
15303 0, /* tp_itemsize */
15304 /* methods */
15305 (destructor)unicodeiter_dealloc, /* tp_dealloc */
15306 0, /* tp_print */
15307 0, /* tp_getattr */
15308 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000015309 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015310 0, /* tp_repr */
15311 0, /* tp_as_number */
15312 0, /* tp_as_sequence */
15313 0, /* tp_as_mapping */
15314 0, /* tp_hash */
15315 0, /* tp_call */
15316 0, /* tp_str */
15317 PyObject_GenericGetAttr, /* tp_getattro */
15318 0, /* tp_setattro */
15319 0, /* tp_as_buffer */
15320 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15321 0, /* tp_doc */
15322 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15323 0, /* tp_clear */
15324 0, /* tp_richcompare */
15325 0, /* tp_weaklistoffset */
15326 PyObject_SelfIter, /* tp_iter */
15327 (iternextfunc)unicodeiter_next, /* tp_iternext */
15328 unicodeiter_methods, /* tp_methods */
15329 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015330};
15331
15332static PyObject *
15333unicode_iter(PyObject *seq)
15334{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015335 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015336
Benjamin Peterson14339b62009-01-31 16:36:08 +000015337 if (!PyUnicode_Check(seq)) {
15338 PyErr_BadInternalCall();
15339 return NULL;
15340 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015341 if (PyUnicode_READY(seq) == -1)
15342 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015343 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15344 if (it == NULL)
15345 return NULL;
15346 it->it_index = 0;
15347 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015348 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015349 _PyObject_GC_TRACK(it);
15350 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015351}
15352
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015353
15354size_t
15355Py_UNICODE_strlen(const Py_UNICODE *u)
15356{
15357 int res = 0;
15358 while(*u++)
15359 res++;
15360 return res;
15361}
15362
15363Py_UNICODE*
15364Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15365{
15366 Py_UNICODE *u = s1;
15367 while ((*u++ = *s2++));
15368 return s1;
15369}
15370
15371Py_UNICODE*
15372Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15373{
15374 Py_UNICODE *u = s1;
15375 while ((*u++ = *s2++))
15376 if (n-- == 0)
15377 break;
15378 return s1;
15379}
15380
15381Py_UNICODE*
15382Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15383{
15384 Py_UNICODE *u1 = s1;
15385 u1 += Py_UNICODE_strlen(u1);
15386 Py_UNICODE_strcpy(u1, s2);
15387 return s1;
15388}
15389
15390int
15391Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15392{
15393 while (*s1 && *s2 && *s1 == *s2)
15394 s1++, s2++;
15395 if (*s1 && *s2)
15396 return (*s1 < *s2) ? -1 : +1;
15397 if (*s1)
15398 return 1;
15399 if (*s2)
15400 return -1;
15401 return 0;
15402}
15403
15404int
15405Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15406{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015407 Py_UNICODE u1, u2;
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015408 for (; n != 0; n--) {
15409 u1 = *s1;
15410 u2 = *s2;
15411 if (u1 != u2)
15412 return (u1 < u2) ? -1 : +1;
15413 if (u1 == '\0')
15414 return 0;
15415 s1++;
15416 s2++;
15417 }
15418 return 0;
15419}
15420
15421Py_UNICODE*
15422Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15423{
15424 const Py_UNICODE *p;
15425 for (p = s; *p; p++)
15426 if (*p == c)
15427 return (Py_UNICODE*)p;
15428 return NULL;
15429}
15430
15431Py_UNICODE*
15432Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15433{
15434 const Py_UNICODE *p;
15435 p = s + Py_UNICODE_strlen(s);
15436 while (p != s) {
15437 p--;
15438 if (*p == c)
15439 return (Py_UNICODE*)p;
15440 }
15441 return NULL;
15442}
Victor Stinner331ea922010-08-10 16:37:20 +000015443
Victor Stinner71133ff2010-09-01 23:43:53 +000015444Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015445PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015446{
Victor Stinner577db2c2011-10-11 22:12:48 +020015447 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015448 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015449
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015450 if (!PyUnicode_Check(unicode)) {
15451 PyErr_BadArgument();
15452 return NULL;
15453 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015454 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015455 if (u == NULL)
15456 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015457 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -070015458 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015459 PyErr_NoMemory();
15460 return NULL;
15461 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015462 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015463 size *= sizeof(Py_UNICODE);
15464 copy = PyMem_Malloc(size);
15465 if (copy == NULL) {
15466 PyErr_NoMemory();
15467 return NULL;
15468 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015469 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015470 return copy;
15471}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015472
Georg Brandl66c221e2010-10-14 07:04:07 +000015473/* A _string module, to export formatter_parser and formatter_field_name_split
15474 to the string.Formatter class implemented in Python. */
15475
15476static PyMethodDef _string_methods[] = {
15477 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15478 METH_O, PyDoc_STR("split the argument as a field name")},
15479 {"formatter_parser", (PyCFunction) formatter_parser,
15480 METH_O, PyDoc_STR("parse the argument as a format string")},
15481 {NULL, NULL}
15482};
15483
15484static struct PyModuleDef _string_module = {
15485 PyModuleDef_HEAD_INIT,
15486 "_string",
15487 PyDoc_STR("string helper module"),
15488 0,
15489 _string_methods,
15490 NULL,
15491 NULL,
15492 NULL,
15493 NULL
15494};
15495
15496PyMODINIT_FUNC
15497PyInit__string(void)
15498{
15499 return PyModule_Create(&_string_module);
15500}
15501
15502
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000015503#ifdef __cplusplus
15504}
15505#endif