blob: 3553aaf4ade2877b226b99e12e64aaf50bbe8af5 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Raymond Hettingerac2ef652015-07-04 16:04:44 -070045#include "stringlib/eq.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000046
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000047#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000048#include <windows.h>
49#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000050
Larry Hastings61272b72014-01-07 12:41:53 -080051/*[clinic input]
Larry Hastingsc2047262014-01-25 20:43:29 -080052class str "PyUnicodeObject *" "&PyUnicode_Type"
Larry Hastings61272b72014-01-07 12:41:53 -080053[clinic start generated code]*/
Larry Hastings581ee362014-01-28 05:00:08 -080054/*[clinic end generated code: output=da39a3ee5e6b4b0d input=604e916854800fa8]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080055
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000056/* --- Globals ------------------------------------------------------------
57
Serhiy Storchaka05997252013-01-26 12:14:02 +020058NOTE: In the interpreter's initialization phase, some globals are currently
59 initialized dynamically as needed. In the process Unicode objects may
60 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000061
62*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000063
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000064
65#ifdef __cplusplus
66extern "C" {
67#endif
68
/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
#define MAX_UNICODE 0x10ffff

/* Sanity check used inside the assert()s of the accessor macros below.
   Debug builds run the structural consistency check (with check_content
   passed as 0); other builds only perform the type check. */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020077
/* Raw access to the utf8 field; casts unconditionally, no checks. */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
/* UTF-8 pointer of a ready string.  For a compact ASCII string this is the
   memory that immediately follows the PyASCIIObject header; for every other
   string it is the stored utf8 field. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
/* Raw access to the utf8_length field; casts unconditionally, no checks. */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
/* UTF-8 length of a ready string.  For a compact ASCII string this is the
   string length itself; otherwise it is the stored utf8_length field. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))
/* Raw field accessors.  Each one casts op to the struct that declares the
   field and performs no type or readiness check; all of them expand to
   lvalues and can therefore be assigned to. */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)
/* Checked readers: assert _PyUnicode_CHECK(op) before reading the field.
   Because of the comma expression these are not usable as lvalues. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)
/* Raw access to the data pointer stored in a PyUnicodeObject; no checks. */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200112
/* File-local replacement for the PyUnicode_READY macro (the previous
   definition is #undef'ed first).  Evaluates to 0 when op is already ready,
   otherwise to the result of _PyUnicode_Ready(op).  op is evaluated more
   than once. */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200119
/* True if the utf8 pointer aliases the object's own character data, i.e.
   no separate UTF-8 buffer is held.  Must not be used on compact ASCII
   strings (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the wstr pointer aliases the object's own character data.
   Every use refers to the macro parameter `op`; the macro must not depend
   on a variable named `unicode` existing at the expansion site. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
127
/* true if the Unicode object has an allocated UTF-8 memory block
   (not shared with other data): the string is not compact ASCII, its utf8
   pointer is set, and that pointer differs from the character data. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    ((!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))
134
/* true if the Unicode object has an allocated wstr memory block
   (not shared with other data): the wstr pointer is set and either the
   string is not ready yet, or wstr differs from the character data.
   The readiness test comes first so PyUnicode_DATA is only evaluated on
   ready strings. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    ((_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
141
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   Each element is converted with a plain (to_type) cast.  The main loop is
   unrolled four elements at a time; the tail loop handles the remaining
   0..3 elements.  The source pointers are cast to "const from_type *" so
   that const-qualified input is not silently stripped of its qualifier, and
   every temporary carries a leading underscore so it cannot shadow a
   caller's local such as `n`. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (const from_type *)(begin); \
        const from_type *_end = (const from_type *)(end);    \
        Py_ssize_t _n = (_end) - (_iter);               \
        const from_type *_unrolled_end =                \
            _iter + _Py_SIZE_ROUND_DOWN(_n, 4);         \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200165
/* Overallocation divisor.  Going by the notes below, a factor of N stands
   for overallocating by 1/N of the requested size (2 -> 50%, 4 -> 25%)
   -- NOTE(review): confirm against the code that uses the macro. */
#ifdef MS_WINDOWS
   /* On Windows, overallocating by 50% is the best factor */
#  define OVERALLOCATE_FACTOR 2
#else
   /* On Linux, overallocating by 25% is the best factor (this branch is
      taken on every non-Windows platform) */
#  define OVERALLOCATE_FACTOR 4
#endif
173
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)

   The pointer starts out as NULL.
*/
static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000183
/* The empty Unicode object is shared to improve performance. */
static PyObject *unicode_empty = NULL;

/* Take a new reference to the shared empty string, creating it with
   PyUnicode_New(0, 0) on first use.  On first creation the reference
   returned by PyUnicode_New stays with the global and the extra
   Py_INCREF is the one handed to the caller.  If creation fails,
   unicode_empty stays NULL and no reference is taken, so callers must
   check unicode_empty afterwards. */
#define _Py_INCREF_UNICODE_EMPTY()                      \
    do {                                                \
        if (unicode_empty != NULL)                      \
            Py_INCREF(unicode_empty);                   \
        else {                                          \
            unicode_empty = PyUnicode_New(0, 0);        \
            if (unicode_empty != NULL) {                \
                Py_INCREF(unicode_empty);               \
                assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
            }                                           \
        }                                               \
    } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000199
/* Return from the enclosing function with a new reference to the shared
   empty string.  The returned value is NULL when the empty string could
   not be created (unicode_empty is then still NULL). */
#define _Py_RETURN_UNICODE_EMPTY()                      \
    do {                                                \
        _Py_INCREF_UNICODE_EMPTY();                     \
        return unicode_empty;                           \
    } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000205
/* Forward declaration */
static inline int
_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);

/* List of static strings (head pointer; NULL while the list is empty). */
static _Py_Identifier *static_strings = NULL;

/* Single character Unicode strings in the Latin-1 range are being
   shared as well.  The array is indexed by code point (0..255); every
   slot starts out NULL. */
static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000216
/* Fast detection of the most frequent whitespace characters.
   128-entry table indexed by ASCII code point: 1 marks a whitespace
   character, 0 anything else.  Sixteen entries per row. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
                  0x0B LINE TABULATION, 0x0C FORM FEED,
                  0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
                  0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40-0x7F: no whitespace characters */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
247
/* forward declarations of file-local helpers that are referenced before
   their definitions */
static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
static PyObject* get_latin1_char(unsigned char ch);
static int unicode_modifiable(PyObject *unicode);


/* Constructors taking a buffer of fixed-width code units plus its length. */
static PyObject *
_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
static PyObject *
_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
static PyObject *
_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);

/* Encoder error-handling helpers. */
static PyObject *
unicode_encode_call_errorhandler(const char *errors,
       PyObject **errorHandler,const char *encoding, const char *reason,
       PyObject *unicode, PyObject **exceptionObject,
       Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);

static void
raise_encode_exception(PyObject **exceptionObject,
                       const char *encoding,
                       PyObject *unicode,
                       Py_ssize_t startpos, Py_ssize_t endpos,
                       const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000273
/* Same for linebreaks.
   128-entry table indexed by ASCII code point: 1 marks a line-breaking
   character, 0 anything else.  Sixteen entries per row. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00-0x0F: 0x0A LINE FEED, 0x0B LINE TABULATION,
                  0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
                  0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40-0x7F: no line-breaking characters */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
301
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300302#include "clinic/unicodeobject.c.h"
303
/* Identifiers for the codec error handlers that are recognised by name.
   _Py_ERROR_OTHER stands for any name that is not listed here. */
typedef enum {
    _Py_ERROR_UNKNOWN=0,
    _Py_ERROR_STRICT,
    _Py_ERROR_SURROGATEESCAPE,
    _Py_ERROR_REPLACE,
    _Py_ERROR_IGNORE,
    _Py_ERROR_BACKSLASHREPLACE,
    _Py_ERROR_SURROGATEPASS,
    _Py_ERROR_XMLCHARREFREPLACE,
    _Py_ERROR_OTHER
} _Py_error_handler;

/* Map an error handler name to its _Py_error_handler identifier.

   errors: NUL-terminated handler name, or NULL.

   Returns _Py_ERROR_STRICT for NULL, the matching identifier for one of
   the known names (compared case-sensitively), and _Py_ERROR_OTHER for
   anything else. */
static _Py_error_handler
get_error_handler(const char *errors)
{
    static const struct {
        const char *name;
        _Py_error_handler handler;
    } known_handlers[] = {
        {"strict",            _Py_ERROR_STRICT},
        {"surrogateescape",   _Py_ERROR_SURROGATEESCAPE},
        {"replace",           _Py_ERROR_REPLACE},
        {"ignore",            _Py_ERROR_IGNORE},
        {"backslashreplace",  _Py_ERROR_BACKSLASHREPLACE},
        {"surrogatepass",     _Py_ERROR_SURROGATEPASS},
        {"xmlcharrefreplace", _Py_ERROR_XMLCHARREFREPLACE},
    };
    size_t idx;

    /* A missing name means the default handler. */
    if (errors == NULL) {
        return _Py_ERROR_STRICT;
    }
    for (idx = 0; idx < sizeof(known_handlers) / sizeof(known_handlers[0]); idx++) {
        if (strcmp(errors, known_handlers[idx].name) == 0) {
            return known_handlers[idx].handler;
        }
    }
    return _Py_ERROR_OTHER;
}
342
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300343/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
344 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000345Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000346PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000347{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000348#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000349 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000350#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000351 /* This is actually an illegal character, so it should
352 not be passed to unichr. */
353 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000354#endif
355}
356
#ifdef Py_DEBUG
/* Debug-only invariant checker for Unicode objects.

   op:            object to check; must satisfy PyUnicode_Check().
   check_content: when non-zero (and the string is not in the legacy
                  wchar kind) also scan the characters to verify that the
                  narrowest possible kind is in use and that the buffer is
                  NUL-terminated.

   Every violation trips an assert(); when the function returns at all it
   returns 1, which lets callers write assert(_PyUnicode_CheckConsistency()). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    /* Compact ASCII: only the PyASCIIObject header exists, so only the
       header fields can be checked. */
    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* Compact non-ASCII: character data follows the compact
               header directly. */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }
        else {
            /* Non-compact: character data lives behind data.any. */
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* Not ready yet: only the wstr buffer may be set. */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
            else {
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                /* ASCII data doubles as its own UTF-8 representation;
                   non-ASCII data must never be aliased by utf8. */
                if (ascii->state.ascii) {
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
                else
                    assert (compact->utf8 != data);
            }
        }
        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr must alias the data exactly when the string kind has
               the same width as wchar_t, and must not alias it otherwise. */
            if (
#if SIZEOF_WCHAR_T == 2
                kind == PyUnicode_2BYTE_KIND
#else
                kind == PyUnicode_4BYTE_KIND
#endif
               )
            {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            } else
                assert(ascii->wstr != data);
        }

        /* An absent cache must have a zero cached length. */
        if (compact->utf8 == NULL)
            assert(compact->utf8_length == 0);
        if (ascii->wstr == NULL)
            assert(compact->wstr_length == 0);
    }
    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND)
    {
        Py_ssize_t i;
        Py_UCS4 maxchar = 0;
        void *data;
        Py_UCS4 ch;

        /* Find the largest code point actually stored. */
        data = PyUnicode_DATA(ascii);
        for (i=0; i < ascii->length; i++)
        {
            ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
        }
        /* The maximum must fall in the range that requires this kind. */
        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else
                assert(maxchar < 128);
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
        }
        else {
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
        }
        /* The element one past the end must be the NUL terminator. */
        assert(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;
}
#endif
472
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100473static PyObject*
474unicode_result_wchar(PyObject *unicode)
475{
476#ifndef Py_DEBUG
477 Py_ssize_t len;
478
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100479 len = _PyUnicode_WSTR_LENGTH(unicode);
480 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100481 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200482 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100483 }
484
485 if (len == 1) {
486 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100487 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100488 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
489 Py_DECREF(unicode);
490 return latin1_char;
491 }
492 }
493
494 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200495 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100496 return NULL;
497 }
498#else
Victor Stinneraa771272012-10-04 02:32:58 +0200499 assert(Py_REFCNT(unicode) == 1);
500
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100501 /* don't make the result ready in debug mode to ensure that the caller
502 makes the string ready before using it */
503 assert(_PyUnicode_CheckConsistency(unicode, 1));
504#endif
505 return unicode;
506}
507
508static PyObject*
509unicode_result_ready(PyObject *unicode)
510{
511 Py_ssize_t length;
512
513 length = PyUnicode_GET_LENGTH(unicode);
514 if (length == 0) {
515 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100516 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200517 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100518 }
519 return unicode_empty;
520 }
521
522 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200523 void *data = PyUnicode_DATA(unicode);
524 int kind = PyUnicode_KIND(unicode);
525 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100526 if (ch < 256) {
527 PyObject *latin1_char = unicode_latin1[ch];
528 if (latin1_char != NULL) {
529 if (unicode != latin1_char) {
530 Py_INCREF(latin1_char);
531 Py_DECREF(unicode);
532 }
533 return latin1_char;
534 }
535 else {
536 assert(_PyUnicode_CheckConsistency(unicode, 1));
537 Py_INCREF(unicode);
538 unicode_latin1[ch] = unicode;
539 return unicode;
540 }
541 }
542 }
543
544 assert(_PyUnicode_CheckConsistency(unicode, 1));
545 return unicode;
546}
547
548static PyObject*
549unicode_result(PyObject *unicode)
550{
551 assert(_PyUnicode_CHECK(unicode));
552 if (PyUnicode_IS_READY(unicode))
553 return unicode_result_ready(unicode);
554 else
555 return unicode_result_wchar(unicode);
556}
557
Victor Stinnerc4b49542011-12-11 22:44:26 +0100558static PyObject*
559unicode_result_unchanged(PyObject *unicode)
560{
561 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500562 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100563 return NULL;
564 Py_INCREF(unicode);
565 return unicode;
566 }
567 else
568 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100569 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100570}
571
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200572/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
573 ASCII, Latin1, UTF-8, etc. */
574static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200575backslashreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200576 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
577{
Victor Stinnerad771582015-10-09 12:38:53 +0200578 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200579 Py_UCS4 ch;
580 enum PyUnicode_Kind kind;
581 void *data;
582
583 assert(PyUnicode_IS_READY(unicode));
584 kind = PyUnicode_KIND(unicode);
585 data = PyUnicode_DATA(unicode);
586
587 size = 0;
588 /* determine replacement size */
589 for (i = collstart; i < collend; ++i) {
590 Py_ssize_t incr;
591
592 ch = PyUnicode_READ(kind, data, i);
593 if (ch < 0x100)
594 incr = 2+2;
595 else if (ch < 0x10000)
596 incr = 2+4;
597 else {
598 assert(ch <= MAX_UNICODE);
Victor Stinner3fa36ff2015-10-09 03:37:11 +0200599 incr = 2+8;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200600 }
601 if (size > PY_SSIZE_T_MAX - incr) {
602 PyErr_SetString(PyExc_OverflowError,
603 "encoded result is too long for a Python string");
604 return NULL;
605 }
606 size += incr;
607 }
608
Victor Stinnerad771582015-10-09 12:38:53 +0200609 str = _PyBytesWriter_Prepare(writer, str, size);
610 if (str == NULL)
611 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200612
613 /* generate replacement */
614 for (i = collstart; i < collend; ++i) {
615 ch = PyUnicode_READ(kind, data, i);
Victor Stinner797485e2015-10-09 03:17:30 +0200616 *str++ = '\\';
617 if (ch >= 0x00010000) {
618 *str++ = 'U';
619 *str++ = Py_hexdigits[(ch>>28)&0xf];
620 *str++ = Py_hexdigits[(ch>>24)&0xf];
621 *str++ = Py_hexdigits[(ch>>20)&0xf];
622 *str++ = Py_hexdigits[(ch>>16)&0xf];
623 *str++ = Py_hexdigits[(ch>>12)&0xf];
624 *str++ = Py_hexdigits[(ch>>8)&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200625 }
Victor Stinner797485e2015-10-09 03:17:30 +0200626 else if (ch >= 0x100) {
627 *str++ = 'u';
628 *str++ = Py_hexdigits[(ch>>12)&0xf];
629 *str++ = Py_hexdigits[(ch>>8)&0xf];
630 }
631 else
632 *str++ = 'x';
633 *str++ = Py_hexdigits[(ch>>4)&0xf];
634 *str++ = Py_hexdigits[ch&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200635 }
636 return str;
637}
638
639/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
640 ASCII, Latin1, UTF-8, etc. */
641static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200642xmlcharrefreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200643 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
644{
Victor Stinnerad771582015-10-09 12:38:53 +0200645 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200646 Py_UCS4 ch;
647 enum PyUnicode_Kind kind;
648 void *data;
649
650 assert(PyUnicode_IS_READY(unicode));
651 kind = PyUnicode_KIND(unicode);
652 data = PyUnicode_DATA(unicode);
653
654 size = 0;
655 /* determine replacement size */
656 for (i = collstart; i < collend; ++i) {
657 Py_ssize_t incr;
658
659 ch = PyUnicode_READ(kind, data, i);
660 if (ch < 10)
661 incr = 2+1+1;
662 else if (ch < 100)
663 incr = 2+2+1;
664 else if (ch < 1000)
665 incr = 2+3+1;
666 else if (ch < 10000)
667 incr = 2+4+1;
668 else if (ch < 100000)
669 incr = 2+5+1;
670 else if (ch < 1000000)
671 incr = 2+6+1;
672 else {
673 assert(ch <= MAX_UNICODE);
674 incr = 2+7+1;
675 }
676 if (size > PY_SSIZE_T_MAX - incr) {
677 PyErr_SetString(PyExc_OverflowError,
678 "encoded result is too long for a Python string");
679 return NULL;
680 }
681 size += incr;
682 }
683
Victor Stinnerad771582015-10-09 12:38:53 +0200684 str = _PyBytesWriter_Prepare(writer, str, size);
685 if (str == NULL)
686 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200687
688 /* generate replacement */
689 for (i = collstart; i < collend; ++i) {
690 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
691 }
692 return str;
693}
694
Thomas Wouters477c8d52006-05-27 19:21:47 +0000695/* --- Bloom Filters ----------------------------------------------------- */
696
697/* stuff to implement simple "bloom filters" for Unicode characters.
698 to keep things simple, we use a single bitmask, using the least 5
699 bits from each unicode characters as the bit index. */
700
701/* the linebreak mask is set up by Unicode_Init below */
702
Antoine Pitrouf068f942010-01-13 14:19:12 +0000703#if LONG_BIT >= 128
704#define BLOOM_WIDTH 128
705#elif LONG_BIT >= 64
706#define BLOOM_WIDTH 64
707#elif LONG_BIT >= 32
708#define BLOOM_WIDTH 32
709#else
710#error "LONG_BIT is smaller than 32"
711#endif
712
Thomas Wouters477c8d52006-05-27 19:21:47 +0000713#define BLOOM_MASK unsigned long
714
Serhiy Storchaka05997252013-01-26 12:14:02 +0200715static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000716
Antoine Pitrouf068f942010-01-13 14:19:12 +0000717#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000718
Benjamin Peterson29060642009-01-31 22:14:21 +0000719#define BLOOM_LINEBREAK(ch) \
720 ((ch) < 128U ? ascii_linebreak[(ch)] : \
721 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000722
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700723static inline BLOOM_MASK
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200724make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000725{
Victor Stinnera85af502013-04-09 21:53:54 +0200726#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
727 do { \
728 TYPE *data = (TYPE *)PTR; \
729 TYPE *end = data + LEN; \
730 Py_UCS4 ch; \
731 for (; data != end; data++) { \
732 ch = *data; \
733 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
734 } \
735 break; \
736 } while (0)
737
Thomas Wouters477c8d52006-05-27 19:21:47 +0000738 /* calculate simple bloom-style bitmask for a given unicode string */
739
Antoine Pitrouf068f942010-01-13 14:19:12 +0000740 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000741
742 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200743 switch (kind) {
744 case PyUnicode_1BYTE_KIND:
745 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
746 break;
747 case PyUnicode_2BYTE_KIND:
748 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
749 break;
750 case PyUnicode_4BYTE_KIND:
751 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
752 break;
753 default:
754 assert(0);
755 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000756 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200757
758#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000759}
760
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300761static int
762ensure_unicode(PyObject *obj)
763{
764 if (!PyUnicode_Check(obj)) {
765 PyErr_Format(PyExc_TypeError,
766 "must be str, not %.100s",
767 Py_TYPE(obj)->tp_name);
768 return -1;
769 }
770 return PyUnicode_READY(obj);
771}
772
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200773/* Compilation of templated routines */
774
775#include "stringlib/asciilib.h"
776#include "stringlib/fastsearch.h"
777#include "stringlib/partition.h"
778#include "stringlib/split.h"
779#include "stringlib/count.h"
780#include "stringlib/find.h"
781#include "stringlib/find_max_char.h"
782#include "stringlib/localeutil.h"
783#include "stringlib/undef.h"
784
785#include "stringlib/ucs1lib.h"
786#include "stringlib/fastsearch.h"
787#include "stringlib/partition.h"
788#include "stringlib/split.h"
789#include "stringlib/count.h"
790#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300791#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200792#include "stringlib/find_max_char.h"
793#include "stringlib/localeutil.h"
794#include "stringlib/undef.h"
795
796#include "stringlib/ucs2lib.h"
797#include "stringlib/fastsearch.h"
798#include "stringlib/partition.h"
799#include "stringlib/split.h"
800#include "stringlib/count.h"
801#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300802#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200803#include "stringlib/find_max_char.h"
804#include "stringlib/localeutil.h"
805#include "stringlib/undef.h"
806
807#include "stringlib/ucs4lib.h"
808#include "stringlib/fastsearch.h"
809#include "stringlib/partition.h"
810#include "stringlib/split.h"
811#include "stringlib/count.h"
812#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300813#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200814#include "stringlib/find_max_char.h"
815#include "stringlib/localeutil.h"
816#include "stringlib/undef.h"
817
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200818#include "stringlib/unicodedefs.h"
819#include "stringlib/fastsearch.h"
820#include "stringlib/count.h"
821#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100822#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200823
Guido van Rossumd57fd912000-03-10 22:53:23 +0000824/* --- Unicode Object ----------------------------------------------------- */
825
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200826static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200827fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200828
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700829static inline Py_ssize_t
830findchar(const void *s, int kind,
831 Py_ssize_t size, Py_UCS4 ch,
832 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200833{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200834 switch (kind) {
835 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200836 if ((Py_UCS1) ch != ch)
837 return -1;
838 if (direction > 0)
839 return ucs1lib_find_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
840 else
841 return ucs1lib_rfind_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200842 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200843 if ((Py_UCS2) ch != ch)
844 return -1;
845 if (direction > 0)
846 return ucs2lib_find_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
847 else
848 return ucs2lib_rfind_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200849 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200850 if (direction > 0)
851 return ucs4lib_find_char((Py_UCS4 *) s, size, ch);
852 else
853 return ucs4lib_rfind_char((Py_UCS4 *) s, size, ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200854 default:
855 assert(0);
856 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200857 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200858}
859
#ifdef Py_DEBUG
/* Debug helper: overwrite the characters in [old_length, current length)
   with 0xff bytes so that code reading them before they are initialized is
   detected earlier.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least
   for ASCII and UCS-4 strings: U+00FF is invalid in ASCII and U+FFFFFFFF is
   an invalid character in Unicode 6.0.

   Nothing is written when the string did not grow. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    /* the kind value is used as the size in bytes of one character */
    const int char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *base = PyUnicode_1BYTE_DATA(unicode);
    const Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        Py_UCS1 *first = base + old_length * char_size;

        memset(first, 0xff, (new_length - old_length) * char_size);
    }
}
#endif
878
Victor Stinnerfe226c02011-10-03 03:52:20 +0200879static PyObject*
880resize_compact(PyObject *unicode, Py_ssize_t length)
881{
882 Py_ssize_t char_size;
883 Py_ssize_t struct_size;
884 Py_ssize_t new_size;
885 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100886 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +0200887#ifdef Py_DEBUG
888 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
889#endif
890
Victor Stinner79891572012-05-03 13:43:07 +0200891 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200892 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100893 assert(PyUnicode_IS_COMPACT(unicode));
894
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200895 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100896 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200897 struct_size = sizeof(PyASCIIObject);
898 else
899 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200900 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200901
Victor Stinnerfe226c02011-10-03 03:52:20 +0200902 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
903 PyErr_NoMemory();
904 return NULL;
905 }
906 new_size = (struct_size + (length + 1) * char_size);
907
Serhiy Storchaka7aa69082015-12-03 01:02:03 +0200908 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
909 PyObject_DEL(_PyUnicode_UTF8(unicode));
910 _PyUnicode_UTF8(unicode) = NULL;
911 _PyUnicode_UTF8_LENGTH(unicode) = 0;
912 }
Victor Stinner84def372011-12-11 20:04:56 +0100913 _Py_DEC_REFTOTAL;
914 _Py_ForgetReference(unicode);
915
Serhiy Storchaka20b39b22014-09-28 11:27:24 +0300916 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +0100917 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100918 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200919 PyErr_NoMemory();
920 return NULL;
921 }
Victor Stinner84def372011-12-11 20:04:56 +0100922 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200923 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100924
Victor Stinnerfe226c02011-10-03 03:52:20 +0200925 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200926 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200927 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100928 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200929 _PyUnicode_WSTR_LENGTH(unicode) = length;
930 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100931 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
932 PyObject_DEL(_PyUnicode_WSTR(unicode));
933 _PyUnicode_WSTR(unicode) = NULL;
Victor Stinner5bc03a62016-01-27 16:56:53 +0100934 if (!PyUnicode_IS_ASCII(unicode))
935 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100936 }
Victor Stinnerafffce42012-10-03 23:03:17 +0200937#ifdef Py_DEBUG
938 unicode_fill_invalid(unicode, old_length);
939#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200940 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
941 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200942 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200943 return unicode;
944}
945
Alexander Belopolsky40018472011-02-26 01:02:56 +0000946static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200947resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000948{
Victor Stinner95663112011-10-04 01:03:50 +0200949 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100950 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200951 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200952 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000953
Victor Stinnerfe226c02011-10-03 03:52:20 +0200954 if (PyUnicode_IS_READY(unicode)) {
955 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200956 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200957 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +0200958#ifdef Py_DEBUG
959 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
960#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200961
962 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200963 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200964 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
965 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200966
967 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
968 PyErr_NoMemory();
969 return -1;
970 }
971 new_size = (length + 1) * char_size;
972
Victor Stinner7a9105a2011-12-12 00:13:42 +0100973 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
974 {
975 PyObject_DEL(_PyUnicode_UTF8(unicode));
976 _PyUnicode_UTF8(unicode) = NULL;
977 _PyUnicode_UTF8_LENGTH(unicode) = 0;
978 }
979
Victor Stinnerfe226c02011-10-03 03:52:20 +0200980 data = (PyObject *)PyObject_REALLOC(data, new_size);
981 if (data == NULL) {
982 PyErr_NoMemory();
983 return -1;
984 }
985 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200986 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200987 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200988 _PyUnicode_WSTR_LENGTH(unicode) = length;
989 }
990 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200991 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200992 _PyUnicode_UTF8_LENGTH(unicode) = length;
993 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200994 _PyUnicode_LENGTH(unicode) = length;
995 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +0200996#ifdef Py_DEBUG
997 unicode_fill_invalid(unicode, old_length);
998#endif
Victor Stinner95663112011-10-04 01:03:50 +0200999 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001000 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001001 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001002 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001003 }
Victor Stinner95663112011-10-04 01:03:50 +02001004 assert(_PyUnicode_WSTR(unicode) != NULL);
1005
1006 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001007 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +02001008 PyErr_NoMemory();
1009 return -1;
1010 }
Victor Stinner7a9105a2011-12-12 00:13:42 +01001011 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +02001012 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +01001013 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +02001014 if (!wstr) {
1015 PyErr_NoMemory();
1016 return -1;
1017 }
1018 _PyUnicode_WSTR(unicode) = wstr;
1019 _PyUnicode_WSTR(unicode)[length] = 0;
1020 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001021 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001022 return 0;
1023}
1024
Victor Stinnerfe226c02011-10-03 03:52:20 +02001025static PyObject*
1026resize_copy(PyObject *unicode, Py_ssize_t length)
1027{
1028 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001029 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001030 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001031
Benjamin Petersonbac79492012-01-14 13:34:47 -05001032 if (PyUnicode_READY(unicode) == -1)
Victor Stinner7a9105a2011-12-12 00:13:42 +01001033 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001034
1035 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1036 if (copy == NULL)
1037 return NULL;
1038
1039 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +02001040 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001041 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +02001042 }
1043 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001044 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001045
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001046 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001047 if (w == NULL)
1048 return NULL;
1049 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1050 copy_length = Py_MIN(copy_length, length);
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +02001051 Py_MEMCPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
1052 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001053 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001054 }
1055}
1056
Guido van Rossumd57fd912000-03-10 22:53:23 +00001057/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +00001058 Ux0000 terminated; some code (e.g. new_identifier)
1059 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001060
1061 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +00001062 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001063
1064*/
1065
Alexander Belopolsky40018472011-02-26 01:02:56 +00001066static PyUnicodeObject *
1067_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001068{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001069 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001070 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001071
Thomas Wouters477c8d52006-05-27 19:21:47 +00001072 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001073 if (length == 0 && unicode_empty != NULL) {
1074 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001075 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001076 }
1077
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001078 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001079 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001080 return (PyUnicodeObject *)PyErr_NoMemory();
1081 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001082 if (length < 0) {
1083 PyErr_SetString(PyExc_SystemError,
1084 "Negative size passed to _PyUnicode_New");
1085 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001086 }
1087
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001088 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1089 if (unicode == NULL)
1090 return NULL;
1091 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +01001092
1093 _PyUnicode_WSTR_LENGTH(unicode) = length;
1094 _PyUnicode_HASH(unicode) = -1;
1095 _PyUnicode_STATE(unicode).interned = 0;
1096 _PyUnicode_STATE(unicode).kind = 0;
1097 _PyUnicode_STATE(unicode).compact = 0;
1098 _PyUnicode_STATE(unicode).ready = 0;
1099 _PyUnicode_STATE(unicode).ascii = 0;
1100 _PyUnicode_DATA_ANY(unicode) = NULL;
1101 _PyUnicode_LENGTH(unicode) = 0;
1102 _PyUnicode_UTF8(unicode) = NULL;
1103 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1104
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001105 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1106 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001107 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00001108 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001109 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00001110 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001111
Jeremy Hyltond8082792003-09-16 19:41:39 +00001112 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +00001113 * the caller fails before initializing str -- unicode_resize()
1114 * reads str[0], and the Keep-Alive optimization can keep memory
1115 * allocated for str alive across a call to unicode_dealloc(unicode).
1116 * We don't want unicode_resize to read uninitialized memory in
1117 * that case.
1118 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001119 _PyUnicode_WSTR(unicode)[0] = 0;
1120 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +01001121
Victor Stinner7931d9a2011-11-04 00:22:48 +01001122 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001123 return unicode;
1124}
1125
Victor Stinnerf42dc442011-10-02 23:33:16 +02001126static const char*
1127unicode_kind_name(PyObject *unicode)
1128{
Victor Stinner42dfd712011-10-03 14:41:45 +02001129 /* don't check consistency: unicode_kind_name() is called from
1130 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +02001131 if (!PyUnicode_IS_COMPACT(unicode))
1132 {
1133 if (!PyUnicode_IS_READY(unicode))
1134 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -06001135 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001136 {
1137 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001138 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001139 return "legacy ascii";
1140 else
1141 return "legacy latin1";
1142 case PyUnicode_2BYTE_KIND:
1143 return "legacy UCS2";
1144 case PyUnicode_4BYTE_KIND:
1145 return "legacy UCS4";
1146 default:
1147 return "<legacy invalid kind>";
1148 }
1149 }
1150 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -06001151 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001152 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001153 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001154 return "ascii";
1155 else
Victor Stinnera3b334d2011-10-03 13:53:37 +02001156 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001157 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001158 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001159 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001160 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001161 default:
1162 return "<invalid compact kind>";
1163 }
1164}
1165
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001166#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001167/* Functions wrapping macros for use in debugger */
/* Function wrapper around the PyUnicode_UTF8() macro, callable from a
   debugger. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);

    return utf8;
}
1171
/* Function wrapper around the _PyUnicode_COMPACT_DATA() macro, callable
   from a debugger. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);

    return data;
}
1175void *_PyUnicode_data(void *unicode){
1176 printf("obj %p\n", unicode);
1177 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1178 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1179 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1180 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1181 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1182 return PyUnicode_DATA(unicode);
1183}
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001184
1185void
1186_PyUnicode_Dump(PyObject *op)
1187{
1188 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001189 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1190 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1191 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001192
Victor Stinnera849a4b2011-10-03 12:12:11 +02001193 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001194 {
1195 if (ascii->state.ascii)
1196 data = (ascii + 1);
1197 else
1198 data = (compact + 1);
1199 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001200 else
1201 data = unicode->data.any;
Victor Stinner293f3f52014-07-01 08:57:10 +02001202 printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1203 unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001204
Victor Stinnera849a4b2011-10-03 12:12:11 +02001205 if (ascii->wstr == data)
1206 printf("shared ");
1207 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001208
Victor Stinnera3b334d2011-10-03 13:53:37 +02001209 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinner293f3f52014-07-01 08:57:10 +02001210 printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
Victor Stinnera849a4b2011-10-03 12:12:11 +02001211 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1212 printf("shared ");
Victor Stinner293f3f52014-07-01 08:57:10 +02001213 printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
1214 compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001215 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001216 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001217}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001218#endif
1219
1220PyObject *
1221PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1222{
1223 PyObject *obj;
1224 PyCompactUnicodeObject *unicode;
1225 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001226 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001227 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001228 Py_ssize_t char_size;
1229 Py_ssize_t struct_size;
1230
1231 /* Optimization for empty strings */
1232 if (size == 0 && unicode_empty != NULL) {
1233 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001234 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001235 }
1236
Victor Stinner9e9d6892011-10-04 01:02:02 +02001237 is_ascii = 0;
1238 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001239 struct_size = sizeof(PyCompactUnicodeObject);
1240 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001241 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001242 char_size = 1;
1243 is_ascii = 1;
1244 struct_size = sizeof(PyASCIIObject);
1245 }
1246 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001247 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001248 char_size = 1;
1249 }
1250 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001251 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001252 char_size = 2;
1253 if (sizeof(wchar_t) == 2)
1254 is_sharing = 1;
1255 }
1256 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001257 if (maxchar > MAX_UNICODE) {
1258 PyErr_SetString(PyExc_SystemError,
1259 "invalid maximum character passed to PyUnicode_New");
1260 return NULL;
1261 }
Victor Stinner8f825062012-04-27 13:55:39 +02001262 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001263 char_size = 4;
1264 if (sizeof(wchar_t) == 4)
1265 is_sharing = 1;
1266 }
1267
1268 /* Ensure we won't overflow the size. */
1269 if (size < 0) {
1270 PyErr_SetString(PyExc_SystemError,
1271 "Negative size passed to PyUnicode_New");
1272 return NULL;
1273 }
1274 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1275 return PyErr_NoMemory();
1276
1277 /* Duplicated allocation code from _PyObject_New() instead of a call to
1278 * PyObject_New() so we are able to allocate space for the object and
1279 * it's data buffer.
1280 */
1281 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1282 if (obj == NULL)
1283 return PyErr_NoMemory();
1284 obj = PyObject_INIT(obj, &PyUnicode_Type);
1285 if (obj == NULL)
1286 return NULL;
1287
1288 unicode = (PyCompactUnicodeObject *)obj;
1289 if (is_ascii)
1290 data = ((PyASCIIObject*)obj) + 1;
1291 else
1292 data = unicode + 1;
1293 _PyUnicode_LENGTH(unicode) = size;
1294 _PyUnicode_HASH(unicode) = -1;
1295 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001296 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001297 _PyUnicode_STATE(unicode).compact = 1;
1298 _PyUnicode_STATE(unicode).ready = 1;
1299 _PyUnicode_STATE(unicode).ascii = is_ascii;
1300 if (is_ascii) {
1301 ((char*)data)[size] = 0;
1302 _PyUnicode_WSTR(unicode) = NULL;
1303 }
Victor Stinner8f825062012-04-27 13:55:39 +02001304 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001305 ((char*)data)[size] = 0;
1306 _PyUnicode_WSTR(unicode) = NULL;
1307 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001308 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001309 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001310 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001311 else {
1312 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001313 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001314 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001315 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001316 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001317 ((Py_UCS4*)data)[size] = 0;
1318 if (is_sharing) {
1319 _PyUnicode_WSTR_LENGTH(unicode) = size;
1320 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1321 }
1322 else {
1323 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1324 _PyUnicode_WSTR(unicode) = NULL;
1325 }
1326 }
Victor Stinner8f825062012-04-27 13:55:39 +02001327#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001328 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001329#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001330 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001331 return obj;
1332}
1333
#if SIZEOF_WCHAR_T == 2
/* Convert the 16-bit wchar_t characters in [begin, end) to UCS4 and store
   them in the 4-byte data of unicode.  A high surrogate immediately
   followed by a low surrogate is joined into a single code point; every
   other unit is copied as is.  The other conversions are implemented as
   macros for efficiency.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        int is_pair;

        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        is_pair = (Py_UNICODE_IS_HIGH_SURROGATE(src[0])
                   && (src + 1) < end
                   && Py_UNICODE_IS_LOW_SURROGATE(src[1]));
        if (!is_pair) {
            *dst++ = *src++;
            continue;
        }
        *dst++ = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
        src += 2;
    }
    /* the output must have been filled exactly */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1373
Victor Stinnercd9950f2011-10-02 00:34:53 +02001374static int
Victor Stinner488fa492011-12-12 00:01:39 +01001375unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001376{
Victor Stinner488fa492011-12-12 00:01:39 +01001377 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001378 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001379 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001380 return -1;
1381 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001382 return 0;
1383}
1384
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001385static int
1386_copy_characters(PyObject *to, Py_ssize_t to_start,
1387 PyObject *from, Py_ssize_t from_start,
1388 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001389{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001390 unsigned int from_kind, to_kind;
1391 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001392
Victor Stinneree4544c2012-05-09 22:24:08 +02001393 assert(0 <= how_many);
1394 assert(0 <= from_start);
1395 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001396 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001397 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001398 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001399
Victor Stinnerd3f08822012-05-29 12:57:52 +02001400 assert(PyUnicode_Check(to));
1401 assert(PyUnicode_IS_READY(to));
1402 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1403
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001404 if (how_many == 0)
1405 return 0;
1406
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001407 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001408 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001409 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001410 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001411
Victor Stinnerf1852262012-06-16 16:38:26 +02001412#ifdef Py_DEBUG
1413 if (!check_maxchar
1414 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1415 {
1416 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1417 Py_UCS4 ch;
1418 Py_ssize_t i;
1419 for (i=0; i < how_many; i++) {
1420 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1421 assert(ch <= to_maxchar);
1422 }
1423 }
1424#endif
1425
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001426 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001427 if (check_maxchar
1428 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1429 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001430 /* Writing Latin-1 characters into an ASCII string requires to
1431 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001432 Py_UCS4 max_char;
1433 max_char = ucs1lib_find_max_char(from_data,
1434 (Py_UCS1*)from_data + how_many);
1435 if (max_char >= 128)
1436 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001437 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001438 Py_MEMCPY((char*)to_data + to_kind * to_start,
1439 (char*)from_data + from_kind * from_start,
1440 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001441 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001442 else if (from_kind == PyUnicode_1BYTE_KIND
1443 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001444 {
1445 _PyUnicode_CONVERT_BYTES(
1446 Py_UCS1, Py_UCS2,
1447 PyUnicode_1BYTE_DATA(from) + from_start,
1448 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1449 PyUnicode_2BYTE_DATA(to) + to_start
1450 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001451 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001452 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001453 && to_kind == PyUnicode_4BYTE_KIND)
1454 {
1455 _PyUnicode_CONVERT_BYTES(
1456 Py_UCS1, Py_UCS4,
1457 PyUnicode_1BYTE_DATA(from) + from_start,
1458 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1459 PyUnicode_4BYTE_DATA(to) + to_start
1460 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001461 }
1462 else if (from_kind == PyUnicode_2BYTE_KIND
1463 && to_kind == PyUnicode_4BYTE_KIND)
1464 {
1465 _PyUnicode_CONVERT_BYTES(
1466 Py_UCS2, Py_UCS4,
1467 PyUnicode_2BYTE_DATA(from) + from_start,
1468 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1469 PyUnicode_4BYTE_DATA(to) + to_start
1470 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001471 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001472 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001473 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1474
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001475 if (!check_maxchar) {
1476 if (from_kind == PyUnicode_2BYTE_KIND
1477 && to_kind == PyUnicode_1BYTE_KIND)
1478 {
1479 _PyUnicode_CONVERT_BYTES(
1480 Py_UCS2, Py_UCS1,
1481 PyUnicode_2BYTE_DATA(from) + from_start,
1482 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1483 PyUnicode_1BYTE_DATA(to) + to_start
1484 );
1485 }
1486 else if (from_kind == PyUnicode_4BYTE_KIND
1487 && to_kind == PyUnicode_1BYTE_KIND)
1488 {
1489 _PyUnicode_CONVERT_BYTES(
1490 Py_UCS4, Py_UCS1,
1491 PyUnicode_4BYTE_DATA(from) + from_start,
1492 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1493 PyUnicode_1BYTE_DATA(to) + to_start
1494 );
1495 }
1496 else if (from_kind == PyUnicode_4BYTE_KIND
1497 && to_kind == PyUnicode_2BYTE_KIND)
1498 {
1499 _PyUnicode_CONVERT_BYTES(
1500 Py_UCS4, Py_UCS2,
1501 PyUnicode_4BYTE_DATA(from) + from_start,
1502 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1503 PyUnicode_2BYTE_DATA(to) + to_start
1504 );
1505 }
1506 else {
1507 assert(0);
1508 return -1;
1509 }
1510 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001511 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001512 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001513 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001514 Py_ssize_t i;
1515
Victor Stinnera0702ab2011-09-29 14:14:38 +02001516 for (i=0; i < how_many; i++) {
1517 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001518 if (ch > to_maxchar)
1519 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001520 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1521 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001522 }
1523 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001524 return 0;
1525}
1526
Victor Stinnerd3f08822012-05-29 12:57:52 +02001527void
1528_PyUnicode_FastCopyCharacters(
1529 PyObject *to, Py_ssize_t to_start,
1530 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001531{
1532 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1533}
1534
1535Py_ssize_t
1536PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1537 PyObject *from, Py_ssize_t from_start,
1538 Py_ssize_t how_many)
1539{
1540 int err;
1541
1542 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1543 PyErr_BadInternalCall();
1544 return -1;
1545 }
1546
Benjamin Petersonbac79492012-01-14 13:34:47 -05001547 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001548 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001549 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001550 return -1;
1551
Victor Stinnerd3f08822012-05-29 12:57:52 +02001552 if (from_start < 0) {
1553 PyErr_SetString(PyExc_IndexError, "string index out of range");
1554 return -1;
1555 }
1556 if (to_start < 0) {
1557 PyErr_SetString(PyExc_IndexError, "string index out of range");
1558 return -1;
1559 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001560 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1561 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1562 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001563 "Cannot write %zi characters at %zi "
1564 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001565 how_many, to_start, PyUnicode_GET_LENGTH(to));
1566 return -1;
1567 }
1568
1569 if (how_many == 0)
1570 return 0;
1571
Victor Stinner488fa492011-12-12 00:01:39 +01001572 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001573 return -1;
1574
1575 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1576 if (err) {
1577 PyErr_Format(PyExc_SystemError,
1578 "Cannot copy %s characters "
1579 "into a string of %s characters",
1580 unicode_kind_name(from),
1581 unicode_kind_name(to));
1582 return -1;
1583 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001584 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001585}
1586
Victor Stinner17222162011-09-28 22:15:37 +02001587/* Find the maximum code point and count the number of surrogate pairs so a
1588 correct string length can be computed before converting a string to UCS4.
1589 This function counts single surrogates as a character and not as a pair.
1590
1591 Return 0 on success, or -1 on error. */
1592static int
1593find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1594 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001595{
1596 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001597 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001598
Victor Stinnerc53be962011-10-02 21:33:54 +02001599 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001600 *num_surrogates = 0;
1601 *maxchar = 0;
1602
1603 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001604#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001605 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1606 && (iter+1) < end
1607 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1608 {
1609 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1610 ++(*num_surrogates);
1611 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001612 }
1613 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001614#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001615 {
1616 ch = *iter;
1617 iter++;
1618 }
1619 if (ch > *maxchar) {
1620 *maxchar = ch;
1621 if (*maxchar > MAX_UNICODE) {
1622 PyErr_Format(PyExc_ValueError,
1623 "character U+%x is not in range [U+0000; U+10ffff]",
1624 ch);
1625 return -1;
1626 }
1627 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001628 }
1629 return 0;
1630}
1631
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001632int
1633_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001634{
1635 wchar_t *end;
1636 Py_UCS4 maxchar = 0;
1637 Py_ssize_t num_surrogates;
1638#if SIZEOF_WCHAR_T == 2
1639 Py_ssize_t length_wo_surrogates;
1640#endif
1641
Georg Brandl7597add2011-10-05 16:36:47 +02001642 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001643 strings were created using _PyObject_New() and where no canonical
1644 representation (the str field) has been set yet aka strings
1645 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001646 assert(_PyUnicode_CHECK(unicode));
1647 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001648 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001649 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001650 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001651 /* Actually, it should neither be interned nor be anything else: */
1652 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001653
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001654 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001655 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001656 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001657 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001658
1659 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001660 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1661 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001662 PyErr_NoMemory();
1663 return -1;
1664 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001665 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001666 _PyUnicode_WSTR(unicode), end,
1667 PyUnicode_1BYTE_DATA(unicode));
1668 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1669 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1670 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1671 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001672 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001673 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001674 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001675 }
1676 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001677 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001678 _PyUnicode_UTF8(unicode) = NULL;
1679 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001680 }
1681 PyObject_FREE(_PyUnicode_WSTR(unicode));
1682 _PyUnicode_WSTR(unicode) = NULL;
1683 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1684 }
1685 /* In this case we might have to convert down from 4-byte native
1686 wchar_t to 2-byte unicode. */
1687 else if (maxchar < 65536) {
1688 assert(num_surrogates == 0 &&
1689 "FindMaxCharAndNumSurrogatePairs() messed up");
1690
Victor Stinner506f5922011-09-28 22:34:18 +02001691#if SIZEOF_WCHAR_T == 2
1692 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001693 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001694 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1695 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1696 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001697 _PyUnicode_UTF8(unicode) = NULL;
1698 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001699#else
1700 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001701 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001702 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001703 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001704 PyErr_NoMemory();
1705 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001706 }
Victor Stinner506f5922011-09-28 22:34:18 +02001707 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1708 _PyUnicode_WSTR(unicode), end,
1709 PyUnicode_2BYTE_DATA(unicode));
1710 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1711 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1712 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001713 _PyUnicode_UTF8(unicode) = NULL;
1714 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001715 PyObject_FREE(_PyUnicode_WSTR(unicode));
1716 _PyUnicode_WSTR(unicode) = NULL;
1717 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1718#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001719 }
1720 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1721 else {
1722#if SIZEOF_WCHAR_T == 2
1723 /* in case the native representation is 2-bytes, we need to allocate a
1724 new normalized 4-byte version. */
1725 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001726 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1727 PyErr_NoMemory();
1728 return -1;
1729 }
Victor Stinnerc3c74152011-10-02 20:39:55 +02001730 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1731 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001732 PyErr_NoMemory();
1733 return -1;
1734 }
1735 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1736 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001737 _PyUnicode_UTF8(unicode) = NULL;
1738 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001739 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1740 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001741 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001742 PyObject_FREE(_PyUnicode_WSTR(unicode));
1743 _PyUnicode_WSTR(unicode) = NULL;
1744 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1745#else
1746 assert(num_surrogates == 0);
1747
Victor Stinnerc3c74152011-10-02 20:39:55 +02001748 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001749 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001750 _PyUnicode_UTF8(unicode) = NULL;
1751 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001752 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1753#endif
1754 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1755 }
1756 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001757 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001758 return 0;
1759}
1760
Alexander Belopolsky40018472011-02-26 01:02:56 +00001761static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001762unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001763{
Walter Dörwald16807132007-05-25 13:52:07 +00001764 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001765 case SSTATE_NOT_INTERNED:
1766 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001767
Benjamin Peterson29060642009-01-31 22:14:21 +00001768 case SSTATE_INTERNED_MORTAL:
1769 /* revive dead object temporarily for DelItem */
1770 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001771 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001772 Py_FatalError(
1773 "deletion of interned string failed");
1774 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001775
Benjamin Peterson29060642009-01-31 22:14:21 +00001776 case SSTATE_INTERNED_IMMORTAL:
1777 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001778
Benjamin Peterson29060642009-01-31 22:14:21 +00001779 default:
1780 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001781 }
1782
Victor Stinner03490912011-10-03 23:45:12 +02001783 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001784 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001785 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001786 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001787 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1788 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001789
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001790 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001791}
1792
#ifdef Py_DEBUG
/* Debug helper: return 1 if `unicode` is the shared empty string or one of
   the cached single-character Latin-1 strings, 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;

    /* Only a ready string of length one can be a cached character. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    return (ch < 256 && unicode_latin1[ch] == unicode);
}
#endif
1809
Alexander Belopolsky40018472011-02-26 01:02:56 +00001810static int
Victor Stinner488fa492011-12-12 00:01:39 +01001811unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001812{
Victor Stinner488fa492011-12-12 00:01:39 +01001813 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001814 if (Py_REFCNT(unicode) != 1)
1815 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001816 if (_PyUnicode_HASH(unicode) != -1)
1817 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001818 if (PyUnicode_CHECK_INTERNED(unicode))
1819 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001820 if (!PyUnicode_CheckExact(unicode))
1821 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001822#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001823 /* singleton refcount is greater than 1 */
1824 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001825#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001826 return 1;
1827}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001828
Victor Stinnerfe226c02011-10-03 03:52:20 +02001829static int
1830unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1831{
1832 PyObject *unicode;
1833 Py_ssize_t old_length;
1834
1835 assert(p_unicode != NULL);
1836 unicode = *p_unicode;
1837
1838 assert(unicode != NULL);
1839 assert(PyUnicode_Check(unicode));
1840 assert(0 <= length);
1841
Victor Stinner910337b2011-10-03 03:20:16 +02001842 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001843 old_length = PyUnicode_WSTR_LENGTH(unicode);
1844 else
1845 old_length = PyUnicode_GET_LENGTH(unicode);
1846 if (old_length == length)
1847 return 0;
1848
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001849 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001850 _Py_INCREF_UNICODE_EMPTY();
1851 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001852 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001853 Py_SETREF(*p_unicode, unicode_empty);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001854 return 0;
1855 }
1856
Victor Stinner488fa492011-12-12 00:01:39 +01001857 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001858 PyObject *copy = resize_copy(unicode, length);
1859 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001860 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001861 Py_SETREF(*p_unicode, copy);
Benjamin Peterson29060642009-01-31 22:14:21 +00001862 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001863 }
1864
Victor Stinnerfe226c02011-10-03 03:52:20 +02001865 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001866 PyObject *new_unicode = resize_compact(unicode, length);
1867 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001868 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001869 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001870 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001871 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001872 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001873}
1874
Alexander Belopolsky40018472011-02-26 01:02:56 +00001875int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001876PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001877{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001878 PyObject *unicode;
1879 if (p_unicode == NULL) {
1880 PyErr_BadInternalCall();
1881 return -1;
1882 }
1883 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001884 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001885 {
1886 PyErr_BadInternalCall();
1887 return -1;
1888 }
1889 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001890}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001891
Serhiy Storchakad65c9492015-11-02 14:10:23 +02001892/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001893
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001894 WARNING: The function doesn't copy the terminating null character and
1895 doesn't check the maximum character (may write a latin1 character in an
1896 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001897static void
1898unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1899 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001900{
1901 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1902 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001903 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001904
1905 switch (kind) {
1906 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001907 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02001908#ifdef Py_DEBUG
1909 if (PyUnicode_IS_ASCII(unicode)) {
1910 Py_UCS4 maxchar = ucs1lib_find_max_char(
1911 (const Py_UCS1*)str,
1912 (const Py_UCS1*)str + len);
1913 assert(maxchar < 128);
1914 }
1915#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001916 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001917 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001918 }
1919 case PyUnicode_2BYTE_KIND: {
1920 Py_UCS2 *start = (Py_UCS2 *)data + index;
1921 Py_UCS2 *ucs2 = start;
1922 assert(index <= PyUnicode_GET_LENGTH(unicode));
1923
Victor Stinner184252a2012-06-16 02:57:41 +02001924 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001925 *ucs2 = (Py_UCS2)*str;
1926
1927 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001928 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001929 }
1930 default: {
1931 Py_UCS4 *start = (Py_UCS4 *)data + index;
1932 Py_UCS4 *ucs4 = start;
1933 assert(kind == PyUnicode_4BYTE_KIND);
1934 assert(index <= PyUnicode_GET_LENGTH(unicode));
1935
Victor Stinner184252a2012-06-16 02:57:41 +02001936 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001937 *ucs4 = (Py_UCS4)*str;
1938
1939 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001940 }
1941 }
1942}
1943
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001944static PyObject*
1945get_latin1_char(unsigned char ch)
1946{
Victor Stinnera464fc12011-10-02 20:39:30 +02001947 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001948 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001949 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001950 if (!unicode)
1951 return NULL;
1952 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001953 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001954 unicode_latin1[ch] = unicode;
1955 }
1956 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001957 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001958}
1959
Victor Stinner985a82a2014-01-03 12:53:47 +01001960static PyObject*
1961unicode_char(Py_UCS4 ch)
1962{
1963 PyObject *unicode;
1964
1965 assert(ch <= MAX_UNICODE);
1966
Victor Stinnerf3b46b42014-01-03 13:16:00 +01001967 if (ch < 256)
1968 return get_latin1_char(ch);
1969
Victor Stinner985a82a2014-01-03 12:53:47 +01001970 unicode = PyUnicode_New(1, ch);
1971 if (unicode == NULL)
1972 return NULL;
1973 switch (PyUnicode_KIND(unicode)) {
1974 case PyUnicode_1BYTE_KIND:
1975 PyUnicode_1BYTE_DATA(unicode)[0] = (Py_UCS1)ch;
1976 break;
1977 case PyUnicode_2BYTE_KIND:
1978 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
1979 break;
1980 default:
1981 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1982 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
1983 }
1984 assert(_PyUnicode_CheckConsistency(unicode, 1));
1985 return unicode;
1986}
1987
Alexander Belopolsky40018472011-02-26 01:02:56 +00001988PyObject *
1989PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001990{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001991 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001992 Py_UCS4 maxchar = 0;
1993 Py_ssize_t num_surrogates;
1994
1995 if (u == NULL)
1996 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001997
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001998 /* If the Unicode data is known at construction time, we can apply
1999 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002000
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002001 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02002002 if (size == 0)
2003 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00002004
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002005 /* Single character Unicode objects in the Latin-1 range are
2006 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002007 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002008 return get_latin1_char((unsigned char)*u);
2009
2010 /* If not empty and not single character, copy the Unicode data
2011 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02002012 if (find_maxchar_surrogates(u, u + size,
2013 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002014 return NULL;
2015
Victor Stinner8faf8212011-12-08 22:14:11 +01002016 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002017 if (!unicode)
2018 return NULL;
2019
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002020 switch (PyUnicode_KIND(unicode)) {
2021 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002022 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002023 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2024 break;
2025 case PyUnicode_2BYTE_KIND:
2026#if Py_UNICODE_SIZE == 2
2027 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
2028#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002029 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002030 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2031#endif
2032 break;
2033 case PyUnicode_4BYTE_KIND:
2034#if SIZEOF_WCHAR_T == 2
2035 /* This is the only case which has to process surrogates, thus
2036 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02002037 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002038#else
2039 assert(num_surrogates == 0);
2040 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
2041#endif
2042 break;
2043 default:
2044 assert(0 && "Impossible state");
2045 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002046
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002047 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002048}
2049
Alexander Belopolsky40018472011-02-26 01:02:56 +00002050PyObject *
2051PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002052{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002053 if (size < 0) {
2054 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00002055 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00002056 return NULL;
2057 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002058 if (u != NULL)
2059 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2060 else
2061 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002062}
2063
Alexander Belopolsky40018472011-02-26 01:02:56 +00002064PyObject *
2065PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00002066{
2067 size_t size = strlen(u);
2068 if (size > PY_SSIZE_T_MAX) {
2069 PyErr_SetString(PyExc_OverflowError, "input too long");
2070 return NULL;
2071 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002072 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002073}
2074
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002075PyObject *
2076_PyUnicode_FromId(_Py_Identifier *id)
2077{
2078 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01002079 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
2080 strlen(id->string),
2081 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002082 if (!id->object)
2083 return NULL;
2084 PyUnicode_InternInPlace(&id->object);
2085 assert(!id->next);
2086 id->next = static_strings;
2087 static_strings = id;
2088 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002089 return id->object;
2090}
2091
2092void
2093_PyUnicode_ClearStaticStrings()
2094{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002095 _Py_Identifier *tmp, *s = static_strings;
2096 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02002097 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002098 tmp = s->next;
2099 s->next = NULL;
2100 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002101 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002102 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002103}
2104
Benjamin Peterson0df54292012-03-26 14:50:32 -04002105/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002106
Victor Stinnerd3f08822012-05-29 12:57:52 +02002107PyObject*
2108_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02002109{
Victor Stinnerd3f08822012-05-29 12:57:52 +02002110 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01002111 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01002112 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02002113#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002114 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02002115#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002116 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01002117 }
Victor Stinner785938e2011-12-11 20:09:03 +01002118 unicode = PyUnicode_New(size, 127);
2119 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02002120 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01002121 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2122 assert(_PyUnicode_CheckConsistency(unicode, 1));
2123 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02002124}
2125
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002126static Py_UCS4
2127kind_maxchar_limit(unsigned int kind)
2128{
Benjamin Petersonead6b532011-12-20 17:23:42 -06002129 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002130 case PyUnicode_1BYTE_KIND:
2131 return 0x80;
2132 case PyUnicode_2BYTE_KIND:
2133 return 0x100;
2134 case PyUnicode_4BYTE_KIND:
2135 return 0x10000;
2136 default:
2137 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01002138 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002139 }
2140}
2141
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -07002142static inline Py_UCS4
Victor Stinnere6abb482012-05-02 01:15:40 +02002143align_maxchar(Py_UCS4 maxchar)
2144{
2145 if (maxchar <= 127)
2146 return 127;
2147 else if (maxchar <= 255)
2148 return 255;
2149 else if (maxchar <= 65535)
2150 return 65535;
2151 else
2152 return MAX_UNICODE;
2153}
2154
Victor Stinner702c7342011-10-05 13:50:52 +02002155static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002156_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00002157{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002158 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002159 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002160
Serhiy Storchaka678db842013-01-26 12:16:36 +02002161 if (size == 0)
2162 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002163 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002164 if (size == 1)
2165 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002166
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002167 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002168 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002169 if (!res)
2170 return NULL;
2171 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002172 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002173 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00002174}
2175
Victor Stinnere57b1c02011-09-28 22:20:48 +02002176static PyObject*
2177_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002178{
2179 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002180 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002181
Serhiy Storchaka678db842013-01-26 12:16:36 +02002182 if (size == 0)
2183 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002184 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002185 if (size == 1)
2186 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002187
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002188 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002189 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002190 if (!res)
2191 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002192 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002193 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002194 else {
2195 _PyUnicode_CONVERT_BYTES(
2196 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2197 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002198 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002199 return res;
2200}
2201
Victor Stinnere57b1c02011-09-28 22:20:48 +02002202static PyObject*
2203_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002204{
2205 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002206 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002207
Serhiy Storchaka678db842013-01-26 12:16:36 +02002208 if (size == 0)
2209 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002210 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002211 if (size == 1)
2212 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002213
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002214 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002215 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002216 if (!res)
2217 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002218 if (max_char < 256)
2219 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2220 PyUnicode_1BYTE_DATA(res));
2221 else if (max_char < 0x10000)
2222 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2223 PyUnicode_2BYTE_DATA(res));
2224 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002225 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002226 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002227 return res;
2228}
2229
2230PyObject*
2231PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2232{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002233 if (size < 0) {
2234 PyErr_SetString(PyExc_ValueError, "size must be positive");
2235 return NULL;
2236 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002237 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002238 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002239 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002240 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002241 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002242 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002243 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002244 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002245 PyErr_SetString(PyExc_SystemError, "invalid kind");
2246 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002247 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002248}
2249
Victor Stinnerece58de2012-04-23 23:36:38 +02002250Py_UCS4
2251_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2252{
2253 enum PyUnicode_Kind kind;
2254 void *startptr, *endptr;
2255
2256 assert(PyUnicode_IS_READY(unicode));
2257 assert(0 <= start);
2258 assert(end <= PyUnicode_GET_LENGTH(unicode));
2259 assert(start <= end);
2260
2261 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2262 return PyUnicode_MAX_CHAR_VALUE(unicode);
2263
2264 if (start == end)
2265 return 127;
2266
Victor Stinner94d558b2012-04-27 22:26:58 +02002267 if (PyUnicode_IS_ASCII(unicode))
2268 return 127;
2269
Victor Stinnerece58de2012-04-23 23:36:38 +02002270 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002271 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002272 endptr = (char *)startptr + end * kind;
2273 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002274 switch(kind) {
2275 case PyUnicode_1BYTE_KIND:
2276 return ucs1lib_find_max_char(startptr, endptr);
2277 case PyUnicode_2BYTE_KIND:
2278 return ucs2lib_find_max_char(startptr, endptr);
2279 case PyUnicode_4BYTE_KIND:
2280 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002281 default:
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002282 assert(0);
2283 return 0;
Victor Stinnerece58de2012-04-23 23:36:38 +02002284 }
2285}
2286
Victor Stinner25a4b292011-10-06 12:31:55 +02002287/* Ensure that a string uses the most efficient storage, if it is not the
2288 case: create a new string with of the right kind. Write NULL into *p_unicode
2289 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002290static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002291unicode_adjust_maxchar(PyObject **p_unicode)
2292{
2293 PyObject *unicode, *copy;
2294 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002295 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002296 unsigned int kind;
2297
2298 assert(p_unicode != NULL);
2299 unicode = *p_unicode;
2300 assert(PyUnicode_IS_READY(unicode));
2301 if (PyUnicode_IS_ASCII(unicode))
2302 return;
2303
2304 len = PyUnicode_GET_LENGTH(unicode);
2305 kind = PyUnicode_KIND(unicode);
2306 if (kind == PyUnicode_1BYTE_KIND) {
2307 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002308 max_char = ucs1lib_find_max_char(u, u + len);
2309 if (max_char >= 128)
2310 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002311 }
2312 else if (kind == PyUnicode_2BYTE_KIND) {
2313 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002314 max_char = ucs2lib_find_max_char(u, u + len);
2315 if (max_char >= 256)
2316 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002317 }
2318 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002319 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002320 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002321 max_char = ucs4lib_find_max_char(u, u + len);
2322 if (max_char >= 0x10000)
2323 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002324 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002325 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002326 if (copy != NULL)
2327 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002328 Py_DECREF(unicode);
2329 *p_unicode = copy;
2330}
2331
Victor Stinner034f6cf2011-09-30 02:26:44 +02002332PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002333_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002334{
Victor Stinner87af4f22011-11-21 23:03:47 +01002335 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002336 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002337
Victor Stinner034f6cf2011-09-30 02:26:44 +02002338 if (!PyUnicode_Check(unicode)) {
2339 PyErr_BadInternalCall();
2340 return NULL;
2341 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002342 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002343 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002344
Victor Stinner87af4f22011-11-21 23:03:47 +01002345 length = PyUnicode_GET_LENGTH(unicode);
2346 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002347 if (!copy)
2348 return NULL;
2349 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2350
Victor Stinner87af4f22011-11-21 23:03:47 +01002351 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2352 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002353 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002354 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002355}
2356
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002357
Victor Stinnerbc603d12011-10-02 01:00:40 +02002358/* Widen Unicode objects to larger buffers. Don't write terminating null
2359 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002360
2361void*
2362_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2363{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002364 Py_ssize_t len;
2365 void *result;
2366 unsigned int skind;
2367
Benjamin Petersonbac79492012-01-14 13:34:47 -05002368 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002369 return NULL;
2370
2371 len = PyUnicode_GET_LENGTH(s);
2372 skind = PyUnicode_KIND(s);
2373 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002374 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002375 return NULL;
2376 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002377 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002378 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002379 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002380 if (!result)
2381 return PyErr_NoMemory();
2382 assert(skind == PyUnicode_1BYTE_KIND);
2383 _PyUnicode_CONVERT_BYTES(
2384 Py_UCS1, Py_UCS2,
2385 PyUnicode_1BYTE_DATA(s),
2386 PyUnicode_1BYTE_DATA(s) + len,
2387 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002388 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002389 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002390 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002391 if (!result)
2392 return PyErr_NoMemory();
2393 if (skind == PyUnicode_2BYTE_KIND) {
2394 _PyUnicode_CONVERT_BYTES(
2395 Py_UCS2, Py_UCS4,
2396 PyUnicode_2BYTE_DATA(s),
2397 PyUnicode_2BYTE_DATA(s) + len,
2398 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002399 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002400 else {
2401 assert(skind == PyUnicode_1BYTE_KIND);
2402 _PyUnicode_CONVERT_BYTES(
2403 Py_UCS1, Py_UCS4,
2404 PyUnicode_1BYTE_DATA(s),
2405 PyUnicode_1BYTE_DATA(s) + len,
2406 result);
2407 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002408 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002409 default:
2410 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002411 }
Victor Stinner01698042011-10-04 00:04:26 +02002412 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002413 return NULL;
2414}
2415
2416static Py_UCS4*
2417as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2418 int copy_null)
2419{
2420 int kind;
2421 void *data;
2422 Py_ssize_t len, targetlen;
2423 if (PyUnicode_READY(string) == -1)
2424 return NULL;
2425 kind = PyUnicode_KIND(string);
2426 data = PyUnicode_DATA(string);
2427 len = PyUnicode_GET_LENGTH(string);
2428 targetlen = len;
2429 if (copy_null)
2430 targetlen++;
2431 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002432 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002433 if (!target) {
2434 PyErr_NoMemory();
2435 return NULL;
2436 }
2437 }
2438 else {
2439 if (targetsize < targetlen) {
2440 PyErr_Format(PyExc_SystemError,
2441 "string is longer than the buffer");
2442 if (copy_null && 0 < targetsize)
2443 target[0] = 0;
2444 return NULL;
2445 }
2446 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002447 if (kind == PyUnicode_1BYTE_KIND) {
2448 Py_UCS1 *start = (Py_UCS1 *) data;
2449 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002450 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002451 else if (kind == PyUnicode_2BYTE_KIND) {
2452 Py_UCS2 *start = (Py_UCS2 *) data;
2453 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2454 }
2455 else {
2456 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002457 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002458 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002459 if (copy_null)
2460 target[len] = 0;
2461 return target;
2462}
2463
2464Py_UCS4*
2465PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2466 int copy_null)
2467{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002468 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002469 PyErr_BadInternalCall();
2470 return NULL;
2471 }
2472 return as_ucs4(string, target, targetsize, copy_null);
2473}
2474
2475Py_UCS4*
2476PyUnicode_AsUCS4Copy(PyObject *string)
2477{
2478 return as_ucs4(string, NULL, 0, 1);
2479}
2480
2481#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00002482
Alexander Belopolsky40018472011-02-26 01:02:56 +00002483PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02002484PyUnicode_FromWideChar(const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002485{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002486 if (w == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002487 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02002488 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson29060642009-01-31 22:14:21 +00002489 PyErr_BadInternalCall();
2490 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002491 }
2492
Martin v. Löwis790465f2008-04-05 20:41:37 +00002493 if (size == -1) {
2494 size = wcslen(w);
2495 }
2496
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002497 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002498}
2499
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002500#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002501
/* Maximum number of characters required for output of %lld or %p.

   A long long needs at most ceil(log10(256) * SIZEOF_LONG_LONG) decimal
   digits, plus 1 for the sign.  53/22 is an upper bound for log10(256), and
   1 + (n*53 - 1) / 22 is the integer form of ceil(n*53 / 22); the remaining
   1 is the sign. */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002506
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002507static int
2508unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2509 Py_ssize_t width, Py_ssize_t precision)
2510{
2511 Py_ssize_t length, fill, arglen;
2512 Py_UCS4 maxchar;
2513
2514 if (PyUnicode_READY(str) == -1)
2515 return -1;
2516
2517 length = PyUnicode_GET_LENGTH(str);
2518 if ((precision == -1 || precision >= length)
2519 && width <= length)
2520 return _PyUnicodeWriter_WriteStr(writer, str);
2521
2522 if (precision != -1)
2523 length = Py_MIN(precision, length);
2524
2525 arglen = Py_MAX(length, width);
2526 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2527 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2528 else
2529 maxchar = writer->maxchar;
2530
2531 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2532 return -1;
2533
2534 if (width > length) {
2535 fill = width - length;
2536 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2537 return -1;
2538 writer->pos += fill;
2539 }
2540
2541 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2542 str, 0, length);
2543 writer->pos += length;
2544 return 0;
2545}
2546
2547static int
2548unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2549 Py_ssize_t width, Py_ssize_t precision)
2550{
2551 /* UTF-8 */
2552 Py_ssize_t length;
2553 PyObject *unicode;
2554 int res;
2555
2556 length = strlen(str);
2557 if (precision != -1)
2558 length = Py_MIN(length, precision);
2559 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2560 if (unicode == NULL)
2561 return -1;
2562
2563 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2564 Py_DECREF(unicode);
2565 return res;
2566}
2567
Victor Stinner96865452011-03-01 23:44:09 +00002568static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002569unicode_fromformat_arg(_PyUnicodeWriter *writer,
2570 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002571{
Victor Stinnere215d962012-10-06 23:03:36 +02002572 const char *p;
2573 Py_ssize_t len;
2574 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002575 Py_ssize_t width;
2576 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002577 int longflag;
2578 int longlongflag;
2579 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002580 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002581
2582 p = f;
2583 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002584 zeropad = 0;
2585 if (*f == '0') {
2586 zeropad = 1;
2587 f++;
2588 }
Victor Stinner96865452011-03-01 23:44:09 +00002589
2590 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002591 width = -1;
2592 if (Py_ISDIGIT((unsigned)*f)) {
2593 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002594 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002595 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002596 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002597 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002598 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002599 return NULL;
2600 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002601 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002602 f++;
2603 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002604 }
2605 precision = -1;
2606 if (*f == '.') {
2607 f++;
2608 if (Py_ISDIGIT((unsigned)*f)) {
2609 precision = (*f - '0');
2610 f++;
2611 while (Py_ISDIGIT((unsigned)*f)) {
2612 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2613 PyErr_SetString(PyExc_ValueError,
2614 "precision too big");
2615 return NULL;
2616 }
2617 precision = (precision * 10) + (*f - '0');
2618 f++;
2619 }
2620 }
Victor Stinner96865452011-03-01 23:44:09 +00002621 if (*f == '%') {
2622 /* "%.3%s" => f points to "3" */
2623 f--;
2624 }
2625 }
2626 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002627 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002628 f--;
2629 }
Victor Stinner96865452011-03-01 23:44:09 +00002630
2631 /* Handle %ld, %lu, %lld and %llu. */
2632 longflag = 0;
2633 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002634 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002635 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002636 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002637 longflag = 1;
2638 ++f;
2639 }
Victor Stinner96865452011-03-01 23:44:09 +00002640 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002641 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002642 longlongflag = 1;
2643 f += 2;
2644 }
Victor Stinner96865452011-03-01 23:44:09 +00002645 }
2646 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002647 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002648 size_tflag = 1;
2649 ++f;
2650 }
Victor Stinnere215d962012-10-06 23:03:36 +02002651
2652 if (f[1] == '\0')
2653 writer->overallocate = 0;
2654
2655 switch (*f) {
2656 case 'c':
2657 {
2658 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002659 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002660 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002661 "character argument not in range(0x110000)");
2662 return NULL;
2663 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002664 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002665 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002666 break;
2667 }
2668
2669 case 'i':
2670 case 'd':
2671 case 'u':
2672 case 'x':
2673 {
2674 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002675 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002676 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002677
2678 if (*f == 'u') {
Victor Stinnere215d962012-10-06 23:03:36 +02002679 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002680 len = sprintf(buffer, "%lu",
Victor Stinnere215d962012-10-06 23:03:36 +02002681 va_arg(*vargs, unsigned long));
Victor Stinnere215d962012-10-06 23:03:36 +02002682 else if (longlongflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002683 len = sprintf(buffer, "%" PY_FORMAT_LONG_LONG "u",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002684 va_arg(*vargs, unsigned long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002685 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002686 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002687 va_arg(*vargs, size_t));
2688 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002689 len = sprintf(buffer, "%u",
Victor Stinnere215d962012-10-06 23:03:36 +02002690 va_arg(*vargs, unsigned int));
2691 }
2692 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002693 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002694 }
2695 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002696 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002697 len = sprintf(buffer, "%li",
Victor Stinnere215d962012-10-06 23:03:36 +02002698 va_arg(*vargs, long));
Victor Stinnere215d962012-10-06 23:03:36 +02002699 else if (longlongflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002700 len = sprintf(buffer, "%" PY_FORMAT_LONG_LONG "i",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002701 va_arg(*vargs, long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002702 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002703 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002704 va_arg(*vargs, Py_ssize_t));
2705 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002706 len = sprintf(buffer, "%i",
Victor Stinnere215d962012-10-06 23:03:36 +02002707 va_arg(*vargs, int));
2708 }
2709 assert(len >= 0);
2710
Victor Stinnere215d962012-10-06 23:03:36 +02002711 if (precision < len)
2712 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002713
2714 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002715 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2716 return NULL;
2717
Victor Stinnere215d962012-10-06 23:03:36 +02002718 if (width > precision) {
2719 Py_UCS4 fillchar;
2720 fill = width - precision;
2721 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002722 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2723 return NULL;
2724 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002725 }
Victor Stinner15a11362012-10-06 23:48:20 +02002726 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002727 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002728 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2729 return NULL;
2730 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002731 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002732
Victor Stinner4a587072013-11-19 12:54:53 +01002733 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2734 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002735 break;
2736 }
2737
2738 case 'p':
2739 {
2740 char number[MAX_LONG_LONG_CHARS];
2741
2742 len = sprintf(number, "%p", va_arg(*vargs, void*));
2743 assert(len >= 0);
2744
2745 /* %p is ill-defined: ensure leading 0x. */
2746 if (number[1] == 'X')
2747 number[1] = 'x';
2748 else if (number[1] != 'x') {
2749 memmove(number + 2, number,
2750 strlen(number) + 1);
2751 number[0] = '0';
2752 number[1] = 'x';
2753 len += 2;
2754 }
2755
Victor Stinner4a587072013-11-19 12:54:53 +01002756 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002757 return NULL;
2758 break;
2759 }
2760
2761 case 's':
2762 {
2763 /* UTF-8 */
2764 const char *s = va_arg(*vargs, const char*);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002765 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002766 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002767 break;
2768 }
2769
2770 case 'U':
2771 {
2772 PyObject *obj = va_arg(*vargs, PyObject *);
2773 assert(obj && _PyUnicode_CHECK(obj));
2774
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002775 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002776 return NULL;
2777 break;
2778 }
2779
2780 case 'V':
2781 {
2782 PyObject *obj = va_arg(*vargs, PyObject *);
2783 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002784 if (obj) {
2785 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002786 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002787 return NULL;
2788 }
2789 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002790 assert(str != NULL);
2791 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002792 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002793 }
2794 break;
2795 }
2796
2797 case 'S':
2798 {
2799 PyObject *obj = va_arg(*vargs, PyObject *);
2800 PyObject *str;
2801 assert(obj);
2802 str = PyObject_Str(obj);
2803 if (!str)
2804 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002805 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002806 Py_DECREF(str);
2807 return NULL;
2808 }
2809 Py_DECREF(str);
2810 break;
2811 }
2812
2813 case 'R':
2814 {
2815 PyObject *obj = va_arg(*vargs, PyObject *);
2816 PyObject *repr;
2817 assert(obj);
2818 repr = PyObject_Repr(obj);
2819 if (!repr)
2820 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002821 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002822 Py_DECREF(repr);
2823 return NULL;
2824 }
2825 Py_DECREF(repr);
2826 break;
2827 }
2828
2829 case 'A':
2830 {
2831 PyObject *obj = va_arg(*vargs, PyObject *);
2832 PyObject *ascii;
2833 assert(obj);
2834 ascii = PyObject_ASCII(obj);
2835 if (!ascii)
2836 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002837 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002838 Py_DECREF(ascii);
2839 return NULL;
2840 }
2841 Py_DECREF(ascii);
2842 break;
2843 }
2844
2845 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002846 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002847 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002848 break;
2849
2850 default:
2851 /* if we stumble upon an unknown formatting code, copy the rest
2852 of the format string to the output string. (we cannot just
2853 skip the code, since there's no way to know what's in the
2854 argument list) */
2855 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01002856 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002857 return NULL;
2858 f = p+len;
2859 return f;
2860 }
2861
2862 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002863 return f;
2864}
2865
Walter Dörwaldd2034312007-05-18 16:29:38 +00002866PyObject *
2867PyUnicode_FromFormatV(const char *format, va_list vargs)
2868{
Victor Stinnere215d962012-10-06 23:03:36 +02002869 va_list vargs2;
2870 const char *f;
2871 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002872
Victor Stinner8f674cc2013-04-17 23:02:17 +02002873 _PyUnicodeWriter_Init(&writer);
2874 writer.min_length = strlen(format) + 100;
2875 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02002876
2877 /* va_list may be an array (of 1 item) on some platforms (ex: AMD64).
2878 Copy it to be able to pass a reference to a subfunction. */
2879 Py_VA_COPY(vargs2, vargs);
2880
2881 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002882 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002883 f = unicode_fromformat_arg(&writer, f, &vargs2);
2884 if (f == NULL)
2885 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002886 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002887 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002888 const char *p;
2889 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002890
Victor Stinnere215d962012-10-06 23:03:36 +02002891 p = f;
2892 do
2893 {
2894 if ((unsigned char)*p > 127) {
2895 PyErr_Format(PyExc_ValueError,
2896 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2897 "string, got a non-ASCII byte: 0x%02x",
2898 (unsigned char)*p);
2899 return NULL;
2900 }
2901 p++;
2902 }
2903 while (*p != '\0' && *p != '%');
2904 len = p - f;
2905
2906 if (*p == '\0')
2907 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01002908
2909 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002910 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002911
2912 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002913 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002914 }
Victor Stinnere215d962012-10-06 23:03:36 +02002915 return _PyUnicodeWriter_Finish(&writer);
2916
2917 fail:
2918 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002919 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002920}
2921
Walter Dörwaldd2034312007-05-18 16:29:38 +00002922PyObject *
2923PyUnicode_FromFormat(const char *format, ...)
2924{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002925 PyObject* ret;
2926 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002927
2928#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002929 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002930#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002931 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002932#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002933 ret = PyUnicode_FromFormatV(format, vargs);
2934 va_end(vargs);
2935 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002936}
2937
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002938#ifdef HAVE_WCHAR_H
2939
Victor Stinner5593d8a2010-10-02 11:11:27 +00002940/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2941 convert a Unicode object to a wide character string.
2942
Victor Stinnerd88d9832011-09-06 02:00:05 +02002943 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002944 character) required to convert the unicode object. Ignore size argument.
2945
Victor Stinnerd88d9832011-09-06 02:00:05 +02002946 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002947 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002948 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002949static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002950unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002951 wchar_t *w,
2952 Py_ssize_t size)
2953{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002954 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002955 const wchar_t *wstr;
2956
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002957 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002958 if (wstr == NULL)
2959 return -1;
2960
Victor Stinner5593d8a2010-10-02 11:11:27 +00002961 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002962 if (size > res)
2963 size = res + 1;
2964 else
2965 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002966 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002967 return res;
2968 }
2969 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002970 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002971}
2972
2973Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002974PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002975 wchar_t *w,
2976 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002977{
2978 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002979 PyErr_BadInternalCall();
2980 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002981 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002982 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002983}
2984
Victor Stinner137c34c2010-09-29 10:25:54 +00002985wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002986PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002987 Py_ssize_t *size)
2988{
2989 wchar_t* buffer;
2990 Py_ssize_t buflen;
2991
2992 if (unicode == NULL) {
2993 PyErr_BadInternalCall();
2994 return NULL;
2995 }
2996
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002997 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002998 if (buflen == -1)
2999 return NULL;
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02003000 buffer = PyMem_NEW(wchar_t, buflen);
Victor Stinner137c34c2010-09-29 10:25:54 +00003001 if (buffer == NULL) {
3002 PyErr_NoMemory();
3003 return NULL;
3004 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003005 buflen = unicode_aswidechar(unicode, buffer, buflen);
Stefan Krah8528c312012-08-19 21:52:43 +02003006 if (buflen == -1) {
3007 PyMem_FREE(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003008 return NULL;
Stefan Krah8528c312012-08-19 21:52:43 +02003009 }
Victor Stinner5593d8a2010-10-02 11:11:27 +00003010 if (size != NULL)
3011 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00003012 return buffer;
3013}
3014
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003015#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003016
Alexander Belopolsky40018472011-02-26 01:02:56 +00003017PyObject *
3018PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003019{
Victor Stinner8faf8212011-12-08 22:14:11 +01003020 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003021 PyErr_SetString(PyExc_ValueError,
3022 "chr() arg not in range(0x110000)");
3023 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003024 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00003025
Victor Stinner985a82a2014-01-03 12:53:47 +01003026 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003027}
3028
Alexander Belopolsky40018472011-02-26 01:02:56 +00003029PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003030PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003031{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003032 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00003033 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003034 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003035 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02003036 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00003037 Py_INCREF(obj);
3038 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003039 }
3040 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003041 /* For a Unicode subtype that's not a Unicode object,
3042 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003043 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003044 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003045 PyErr_Format(PyExc_TypeError,
3046 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00003047 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003048 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003049}
3050
Alexander Belopolsky40018472011-02-26 01:02:56 +00003051PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003052PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003053 const char *encoding,
3054 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003055{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003056 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003057 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003058
Guido van Rossumd57fd912000-03-10 22:53:23 +00003059 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003060 PyErr_BadInternalCall();
3061 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003062 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003063
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003064 /* Decoding bytes objects is the most common case and should be fast */
3065 if (PyBytes_Check(obj)) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003066 if (PyBytes_GET_SIZE(obj) == 0)
3067 _Py_RETURN_UNICODE_EMPTY();
3068 v = PyUnicode_Decode(
3069 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3070 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003071 return v;
3072 }
3073
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003074 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003075 PyErr_SetString(PyExc_TypeError,
3076 "decoding str is not supported");
3077 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003078 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003079
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003080 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3081 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3082 PyErr_Format(PyExc_TypeError,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003083 "decoding to str: need a bytes-like object, %.80s found",
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003084 Py_TYPE(obj)->tp_name);
3085 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003086 }
Tim Petersced69f82003-09-16 20:30:58 +00003087
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003088 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003089 PyBuffer_Release(&buffer);
3090 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003091 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003092
Serhiy Storchaka05997252013-01-26 12:14:02 +02003093 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003094 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003095 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003096}
3097
/* Normalize an encoding name: C implementation of
   encodings.normalize_encoding().

   Alphanumeric characters and '.' are copied to lower[] in lower case.
   Any run of other characters is replaced by a single '_', but only when
   it sits between two copied characters (leading and trailing runs are
   dropped).  The result is null-terminated.

   Return 1 on success, or 0 on error (encoding is longer than
   lower_len-1). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *in;
    char *out;
    char *out_end;
    int pending_sep;

    assert(encoding != NULL);

    out = lower;
    /* Last usable slot; it is reserved for the terminating null. */
    out_end = &lower[lower_len - 1];
    pending_sep = 0;

    for (in = encoding; *in != 0; in++) {
        char c = *in;

        if (!(Py_ISALNUM(c) || c == '.')) {
            /* Remember the separator; emit it lazily. */
            pending_sep = 1;
            continue;
        }

        if (pending_sep && out != lower) {
            if (out == out_end) {
                return 0;
            }
            *out++ = '_';
        }
        pending_sep = 0;

        if (out == out_end) {
            return 0;
        }
        *out++ = Py_TOLOWER(c);
    }

    *out = '\0';
    return 1;
}
3146
Alexander Belopolsky40018472011-02-26 01:02:56 +00003147PyObject *
3148PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003149 Py_ssize_t size,
3150 const char *encoding,
3151 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003152{
3153 PyObject *buffer = NULL, *unicode;
3154 Py_buffer info;
Victor Stinner942889a2016-09-05 15:40:10 -07003155 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3156
3157 if (encoding == NULL) {
3158 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3159 }
Victor Stinner600d3be2010-06-10 12:00:55 +00003160
Fred Drakee4315f52000-05-09 19:53:39 +00003161 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003162 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3163 char *lower = buflower;
3164
3165 /* Fast paths */
3166 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3167 lower += 3;
3168 if (*lower == '_') {
3169 /* Match "utf8" and "utf_8" */
3170 lower++;
3171 }
3172
3173 if (lower[0] == '8' && lower[1] == 0) {
3174 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3175 }
3176 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3177 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3178 }
3179 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3180 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3181 }
3182 }
3183 else {
3184 if (strcmp(lower, "ascii") == 0
3185 || strcmp(lower, "us_ascii") == 0) {
3186 return PyUnicode_DecodeASCII(s, size, errors);
3187 }
3188 #ifdef HAVE_MBCS
3189 else if (strcmp(lower, "mbcs") == 0) {
3190 return PyUnicode_DecodeMBCS(s, size, errors);
3191 }
3192 #endif
3193 else if (strcmp(lower, "latin1") == 0
3194 || strcmp(lower, "latin_1") == 0
3195 || strcmp(lower, "iso_8859_1") == 0
3196 || strcmp(lower, "iso8859_1") == 0) {
3197 return PyUnicode_DecodeLatin1(s, size, errors);
3198 }
3199 }
Victor Stinner37296e82010-06-10 13:36:23 +00003200 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003201
3202 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003203 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003204 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003205 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003206 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003207 if (buffer == NULL)
3208 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003209 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003210 if (unicode == NULL)
3211 goto onError;
3212 if (!PyUnicode_Check(unicode)) {
3213 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003214 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3215 "use codecs.decode() to decode to arbitrary types",
3216 encoding,
3217 Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003218 Py_DECREF(unicode);
3219 goto onError;
3220 }
3221 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003222 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003223
Benjamin Peterson29060642009-01-31 22:14:21 +00003224 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003225 Py_XDECREF(buffer);
3226 return NULL;
3227}
3228
Alexander Belopolsky40018472011-02-26 01:02:56 +00003229PyObject *
3230PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003231 const char *encoding,
3232 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003233{
3234 PyObject *v;
3235
3236 if (!PyUnicode_Check(unicode)) {
3237 PyErr_BadArgument();
3238 goto onError;
3239 }
3240
3241 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003242 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003243
3244 /* Decode via the codec registry */
3245 v = PyCodec_Decode(unicode, encoding, errors);
3246 if (v == NULL)
3247 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003248 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003249
Benjamin Peterson29060642009-01-31 22:14:21 +00003250 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003251 return NULL;
3252}
3253
Alexander Belopolsky40018472011-02-26 01:02:56 +00003254PyObject *
3255PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003256 const char *encoding,
3257 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003258{
3259 PyObject *v;
3260
3261 if (!PyUnicode_Check(unicode)) {
3262 PyErr_BadArgument();
3263 goto onError;
3264 }
3265
3266 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003267 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003268
3269 /* Decode via the codec registry */
3270 v = PyCodec_Decode(unicode, encoding, errors);
3271 if (v == NULL)
3272 goto onError;
3273 if (!PyUnicode_Check(v)) {
3274 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003275 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3276 "use codecs.decode() to decode to arbitrary types",
3277 encoding,
3278 Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003279 Py_DECREF(v);
3280 goto onError;
3281 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003282 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003283
Benjamin Peterson29060642009-01-31 22:14:21 +00003284 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003285 return NULL;
3286}
3287
Alexander Belopolsky40018472011-02-26 01:02:56 +00003288PyObject *
3289PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003290 Py_ssize_t size,
3291 const char *encoding,
3292 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003293{
3294 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003295
Guido van Rossumd57fd912000-03-10 22:53:23 +00003296 unicode = PyUnicode_FromUnicode(s, size);
3297 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003298 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003299 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3300 Py_DECREF(unicode);
3301 return v;
3302}
3303
Alexander Belopolsky40018472011-02-26 01:02:56 +00003304PyObject *
3305PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003306 const char *encoding,
3307 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003308{
3309 PyObject *v;
3310
3311 if (!PyUnicode_Check(unicode)) {
3312 PyErr_BadArgument();
3313 goto onError;
3314 }
3315
3316 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003317 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003318
3319 /* Encode via the codec registry */
3320 v = PyCodec_Encode(unicode, encoding, errors);
3321 if (v == NULL)
3322 goto onError;
3323 return v;
3324
Benjamin Peterson29060642009-01-31 22:14:21 +00003325 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003326 return NULL;
3327}
3328
/* Locate the first wide character of wstr that wcstombs() refuses to
   convert.

   The string is fed to wcstombs() one character at a time (one surrogate
   pair at a time when wchar_t is 16 bits wide).  Returns the index of the
   first element for which the conversion fails, or 0 when every element
   converts. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
#if SIZEOF_WCHAR_T == 2
    wchar_t unit[3];
#else
    wchar_t unit[2];
#endif
    char scratch[MB_LEN_MAX];
    const wchar_t *cursor = wstr;

    /* The last slot always holds the terminator. */
#if SIZEOF_WCHAR_T == 2
    unit[2] = 0;
#else
    unit[1] = 0;
#endif

    while (*cursor != L'\0') {
        const wchar_t *candidate = cursor;

#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cursor[1]))
        {
            unit[0] = cursor[0];
            unit[1] = cursor[1];
            cursor += 2;
        }
        else {
            unit[0] = *cursor;
            unit[1] = 0;
            cursor++;
        }
#else
        unit[0] = *cursor;
        cursor++;
#endif
        if (wcstombs(scratch, unit, sizeof(scratch)) == (size_t)-1) {
            return candidate - wstr;
        }
    }

    /* failed to find the unencodable character */
    return 0;
}
3375
Victor Stinner1b579672011-12-17 05:47:23 +01003376static int
3377locale_error_handler(const char *errors, int *surrogateescape)
3378{
Victor Stinner50149202015-09-22 00:26:54 +02003379 _Py_error_handler error_handler = get_error_handler(errors);
3380 switch (error_handler)
3381 {
3382 case _Py_ERROR_STRICT:
Victor Stinner1b579672011-12-17 05:47:23 +01003383 *surrogateescape = 0;
3384 return 0;
Victor Stinner50149202015-09-22 00:26:54 +02003385 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner1b579672011-12-17 05:47:23 +01003386 *surrogateescape = 1;
3387 return 0;
Victor Stinner50149202015-09-22 00:26:54 +02003388 default:
3389 PyErr_Format(PyExc_ValueError,
3390 "only 'strict' and 'surrogateescape' error handlers "
3391 "are supported, not '%s'",
3392 errors);
3393 return -1;
Victor Stinner1b579672011-12-17 05:47:23 +01003394 }
Victor Stinner1b579672011-12-17 05:47:23 +01003395}
3396
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003397PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003398PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003399{
3400 Py_ssize_t wlen, wlen2;
3401 wchar_t *wstr;
3402 PyObject *bytes = NULL;
3403 char *errmsg;
Raymond Hettingere56666d2013-08-04 11:51:03 -07003404 PyObject *reason = NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003405 PyObject *exc;
3406 size_t error_pos;
Victor Stinner1b579672011-12-17 05:47:23 +01003407 int surrogateescape;
3408
3409 if (locale_error_handler(errors, &surrogateescape) < 0)
3410 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003411
3412 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3413 if (wstr == NULL)
3414 return NULL;
3415
3416 wlen2 = wcslen(wstr);
3417 if (wlen2 != wlen) {
3418 PyMem_Free(wstr);
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003419 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003420 return NULL;
3421 }
3422
3423 if (surrogateescape) {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003424 /* "surrogateescape" error handler */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003425 char *str;
3426
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003427 str = Py_EncodeLocale(wstr, &error_pos);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003428 if (str == NULL) {
3429 if (error_pos == (size_t)-1) {
3430 PyErr_NoMemory();
3431 PyMem_Free(wstr);
3432 return NULL;
3433 }
3434 else {
3435 goto encode_error;
3436 }
3437 }
3438 PyMem_Free(wstr);
3439
3440 bytes = PyBytes_FromString(str);
3441 PyMem_Free(str);
3442 }
3443 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003444 /* strict mode */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003445 size_t len, len2;
3446
3447 len = wcstombs(NULL, wstr, 0);
3448 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003449 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003450 goto encode_error;
3451 }
3452
3453 bytes = PyBytes_FromStringAndSize(NULL, len);
3454 if (bytes == NULL) {
3455 PyMem_Free(wstr);
3456 return NULL;
3457 }
3458
3459 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3460 if (len2 == (size_t)-1 || len2 > len) {
Victor Stinner2f197072011-12-17 07:08:30 +01003461 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003462 goto encode_error;
3463 }
3464 PyMem_Free(wstr);
3465 }
3466 return bytes;
3467
3468encode_error:
3469 errmsg = strerror(errno);
3470 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003471
3472 if (error_pos == (size_t)-1)
3473 error_pos = wcstombs_errorpos(wstr);
3474
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003475 PyMem_Free(wstr);
3476 Py_XDECREF(bytes);
3477
Victor Stinner2f197072011-12-17 07:08:30 +01003478 if (errmsg != NULL) {
3479 size_t errlen;
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003480 wstr = Py_DecodeLocale(errmsg, &errlen);
Victor Stinner2f197072011-12-17 07:08:30 +01003481 if (wstr != NULL) {
3482 reason = PyUnicode_FromWideChar(wstr, errlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003483 PyMem_RawFree(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003484 } else
3485 errmsg = NULL;
3486 }
3487 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003488 reason = PyUnicode_FromString(
3489 "wcstombs() encountered an unencodable "
3490 "wide character");
3491 if (reason == NULL)
3492 return NULL;
3493
3494 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3495 "locale", unicode,
3496 (Py_ssize_t)error_pos,
3497 (Py_ssize_t)(error_pos+1),
3498 reason);
3499 Py_DECREF(reason);
3500 if (exc != NULL) {
3501 PyCodec_StrictErrors(exc);
3502 Py_XDECREF(exc);
3503 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003504 return NULL;
3505}
3506
Victor Stinnerad158722010-10-27 00:25:46 +00003507PyObject *
3508PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003509{
Victor Stinner99b95382011-07-04 14:23:54 +02003510#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003511 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003512#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003513 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003514#else
Victor Stinner793b5312011-04-27 00:24:21 +02003515 PyInterpreterState *interp = PyThreadState_GET()->interp;
3516 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3517 cannot use it to encode and decode filenames before it is loaded. Load
3518 the Python codec requires to encode at least its own filename. Use the C
3519 version of the locale codec until the codec registry is initialized and
3520 the Python codec is loaded.
3521
3522 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3523 cannot only rely on it: check also interp->fscodec_initialized for
3524 subinterpreters. */
3525 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003526 return PyUnicode_AsEncodedString(unicode,
3527 Py_FileSystemDefaultEncoding,
3528 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003529 }
3530 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003531 return PyUnicode_EncodeLocale(unicode, "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003532 }
Victor Stinnerad158722010-10-27 00:25:46 +00003533#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003534}
3535
Alexander Belopolsky40018472011-02-26 01:02:56 +00003536PyObject *
3537PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003538 const char *encoding,
3539 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003540{
3541 PyObject *v;
Victor Stinner942889a2016-09-05 15:40:10 -07003542 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003543
Guido van Rossumd57fd912000-03-10 22:53:23 +00003544 if (!PyUnicode_Check(unicode)) {
3545 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003546 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003547 }
Fred Drakee4315f52000-05-09 19:53:39 +00003548
Victor Stinner942889a2016-09-05 15:40:10 -07003549 if (encoding == NULL) {
3550 return _PyUnicode_AsUTF8String(unicode, errors);
3551 }
3552
Fred Drakee4315f52000-05-09 19:53:39 +00003553 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003554 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3555 char *lower = buflower;
3556
3557 /* Fast paths */
3558 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3559 lower += 3;
3560 if (*lower == '_') {
3561 /* Match "utf8" and "utf_8" */
3562 lower++;
3563 }
3564
3565 if (lower[0] == '8' && lower[1] == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003566 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner942889a2016-09-05 15:40:10 -07003567 }
3568 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3569 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3570 }
3571 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3572 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3573 }
Victor Stinnera5c68c32011-03-02 01:03:14 +00003574 }
Victor Stinner942889a2016-09-05 15:40:10 -07003575 else {
3576 if (strcmp(lower, "ascii") == 0
3577 || strcmp(lower, "us_ascii") == 0) {
3578 return _PyUnicode_AsASCIIString(unicode, errors);
3579 }
Victor Stinner99b95382011-07-04 14:23:54 +02003580#ifdef HAVE_MBCS
Victor Stinner942889a2016-09-05 15:40:10 -07003581 else if (strcmp(lower, "mbcs") == 0) {
3582 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3583 }
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003584#endif
Victor Stinner942889a2016-09-05 15:40:10 -07003585 else if (strcmp(lower, "latin1") == 0 ||
3586 strcmp(lower, "latin_1") == 0 ||
3587 strcmp(lower, "iso_8859_1") == 0 ||
3588 strcmp(lower, "iso8859_1") == 0) {
3589 return _PyUnicode_AsLatin1String(unicode, errors);
3590 }
3591 }
Victor Stinner37296e82010-06-10 13:36:23 +00003592 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003593
3594 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003595 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003596 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003597 return NULL;
3598
3599 /* The normal path */
3600 if (PyBytes_Check(v))
3601 return v;
3602
3603 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003604 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003605 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003606 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003607
3608 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003609 "encoder %s returned bytearray instead of bytes; "
3610 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003611 encoding);
3612 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003613 Py_DECREF(v);
3614 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003615 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003616
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003617 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3618 Py_DECREF(v);
3619 return b;
3620 }
3621
3622 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003623 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
3624 "use codecs.encode() to encode to arbitrary types",
3625 encoding,
3626 Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003627 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003628 return NULL;
3629}
3630
Alexander Belopolsky40018472011-02-26 01:02:56 +00003631PyObject *
3632PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003633 const char *encoding,
3634 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003635{
3636 PyObject *v;
3637
3638 if (!PyUnicode_Check(unicode)) {
3639 PyErr_BadArgument();
3640 goto onError;
3641 }
3642
3643 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003644 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003645
3646 /* Encode via the codec registry */
3647 v = PyCodec_Encode(unicode, encoding, errors);
3648 if (v == NULL)
3649 goto onError;
3650 if (!PyUnicode_Check(v)) {
3651 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003652 "'%.400s' encoder returned '%.400s' instead of 'str'; "
3653 "use codecs.encode() to encode to arbitrary types",
3654 encoding,
3655 Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003656 Py_DECREF(v);
3657 goto onError;
3658 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003659 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003660
Benjamin Peterson29060642009-01-31 22:14:21 +00003661 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003662 return NULL;
3663}
3664
/* Locate the first multibyte sequence in str[0:len] that mbrtowc() cannot
   convert and return its byte offset from the start of str.

   Returns 0 when no such sequence is found, when a null character is
   reached first, or when HAVE_MBRTOWC is not defined. */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *cursor = str;
    size_t remaining = len;
    mbstate_t state;
    wchar_t wc;

    memset(&state, 0, sizeof state);
    while (remaining != 0) {
        size_t consumed = mbrtowc(&wc, cursor, remaining, &state);

        if (consumed == 0) {
            /* Reached end of string */
            break;
        }
        if (consumed == (size_t)-1 || consumed == (size_t)-2) {
            /* Conversion error or incomplete character */
            return cursor - str;
        }
        cursor += consumed;
        remaining -= consumed;
    }
    /* failed to find the undecodable byte sequence */
#endif
    return 0;
}
3695
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003696PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003697PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003698 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003699{
3700 wchar_t smallbuf[256];
3701 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3702 wchar_t *wstr;
3703 size_t wlen, wlen2;
3704 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003705 int surrogateescape;
Victor Stinner2f197072011-12-17 07:08:30 +01003706 size_t error_pos;
3707 char *errmsg;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01003708 PyObject *reason = NULL; /* initialize to prevent gcc warning */
3709 PyObject *exc;
Victor Stinner1b579672011-12-17 05:47:23 +01003710
3711 if (locale_error_handler(errors, &surrogateescape) < 0)
3712 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003713
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003714 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3715 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003716 return NULL;
3717 }
3718
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003719 if (surrogateescape) {
3720 /* "surrogateescape" error handler */
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003721 wstr = Py_DecodeLocale(str, &wlen);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003722 if (wstr == NULL) {
3723 if (wlen == (size_t)-1)
3724 PyErr_NoMemory();
3725 else
3726 PyErr_SetFromErrno(PyExc_OSError);
3727 return NULL;
3728 }
3729
3730 unicode = PyUnicode_FromWideChar(wstr, wlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003731 PyMem_RawFree(wstr);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003732 }
3733 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003734 /* strict mode */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003735#ifndef HAVE_BROKEN_MBSTOWCS
3736 wlen = mbstowcs(NULL, str, 0);
3737#else
3738 wlen = len;
3739#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003740 if (wlen == (size_t)-1)
3741 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003742 if (wlen+1 <= smallbuf_len) {
3743 wstr = smallbuf;
3744 }
3745 else {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02003746 wstr = PyMem_New(wchar_t, wlen+1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003747 if (!wstr)
3748 return PyErr_NoMemory();
3749 }
3750
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003751 wlen2 = mbstowcs(wstr, str, wlen+1);
3752 if (wlen2 == (size_t)-1) {
3753 if (wstr != smallbuf)
3754 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003755 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003756 }
3757#ifdef HAVE_BROKEN_MBSTOWCS
3758 assert(wlen2 == wlen);
3759#endif
3760 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3761 if (wstr != smallbuf)
3762 PyMem_Free(wstr);
3763 }
3764 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003765
3766decode_error:
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +02003767 reason = NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01003768 errmsg = strerror(errno);
3769 assert(errmsg != NULL);
3770
3771 error_pos = mbstowcs_errorpos(str, len);
3772 if (errmsg != NULL) {
3773 size_t errlen;
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003774 wstr = Py_DecodeLocale(errmsg, &errlen);
Victor Stinner2f197072011-12-17 07:08:30 +01003775 if (wstr != NULL) {
3776 reason = PyUnicode_FromWideChar(wstr, errlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003777 PyMem_RawFree(wstr);
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +02003778 }
Victor Stinner2f197072011-12-17 07:08:30 +01003779 }
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +02003780 if (reason == NULL)
Victor Stinner2f197072011-12-17 07:08:30 +01003781 reason = PyUnicode_FromString(
3782 "mbstowcs() encountered an invalid multibyte sequence");
3783 if (reason == NULL)
3784 return NULL;
3785
3786 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3787 "locale", str, len,
3788 (Py_ssize_t)error_pos,
3789 (Py_ssize_t)(error_pos+1),
3790 reason);
3791 Py_DECREF(reason);
3792 if (exc != NULL) {
3793 PyCodec_StrictErrors(exc);
3794 Py_XDECREF(exc);
3795 }
3796 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003797}
3798
3799PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003800PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003801{
3802 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003803 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003804}
3805
3806
3807PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003808PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003809 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003810 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3811}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003812
Christian Heimes5894ba72007-11-04 11:43:14 +00003813PyObject*
3814PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3815{
Victor Stinner99b95382011-07-04 14:23:54 +02003816#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003817 return PyUnicode_DecodeMBCS(s, size, NULL);
3818#elif defined(__APPLE__)
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003819 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003820#else
Victor Stinner793b5312011-04-27 00:24:21 +02003821 PyInterpreterState *interp = PyThreadState_GET()->interp;
3822 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3823 cannot use it to encode and decode filenames before it is loaded. Load
3824 the Python codec requires to encode at least its own filename. Use the C
3825 version of the locale codec until the codec registry is initialized and
3826 the Python codec is loaded.
3827
3828 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3829 cannot only rely on it: check also interp->fscodec_initialized for
3830 subinterpreters. */
3831 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003832 return PyUnicode_Decode(s, size,
3833 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003834 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003835 }
3836 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003837 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003838 }
Victor Stinnerad158722010-10-27 00:25:46 +00003839#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003840}
3841
Martin v. Löwis011e8422009-05-05 04:43:17 +00003842
3843int
3844PyUnicode_FSConverter(PyObject* arg, void* addr)
3845{
Brett Cannonec6ce872016-09-06 15:50:29 -07003846 PyObject *path = NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003847 PyObject *output = NULL;
3848 Py_ssize_t size;
3849 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003850 if (arg == NULL) {
3851 Py_DECREF(*(PyObject**)addr);
Benjamin Petersona4d33b32015-11-15 21:57:39 -08003852 *(PyObject**)addr = NULL;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003853 return 1;
3854 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003855 path = PyOS_FSPath(arg);
3856 if (path == NULL) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003857 return 0;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003858 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003859 if (PyBytes_Check(path)) {
3860 output = path;
3861 }
3862 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
3863 output = PyUnicode_EncodeFSDefault(path);
3864 Py_DECREF(path);
3865 if (!output) {
3866 return 0;
3867 }
3868 assert(PyBytes_Check(output));
3869 }
3870
Victor Stinner0ea2a462010-04-30 00:22:08 +00003871 size = PyBytes_GET_SIZE(output);
3872 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02003873 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003874 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003875 Py_DECREF(output);
3876 return 0;
3877 }
3878 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003879 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003880}
3881
3882
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003883int
3884PyUnicode_FSDecoder(PyObject* arg, void* addr)
3885{
Brett Cannona5711202016-09-06 19:36:01 -07003886 int is_buffer = 0;
3887 PyObject *path = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003888 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003889 if (arg == NULL) {
3890 Py_DECREF(*(PyObject**)addr);
3891 return 1;
3892 }
Brett Cannona5711202016-09-06 19:36:01 -07003893
3894 is_buffer = PyObject_CheckBuffer(arg);
3895 if (!is_buffer) {
3896 path = PyOS_FSPath(arg);
3897 if (path == NULL) {
Serhiy Storchakafebc3322016-08-06 23:29:29 +03003898 return 0;
3899 }
Brett Cannona5711202016-09-06 19:36:01 -07003900 }
3901 else {
3902 path = arg;
3903 Py_INCREF(arg);
3904 }
3905
3906 if (PyUnicode_Check(path)) {
3907 if (PyUnicode_READY(path) == -1) {
3908 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003909 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07003910 }
3911 output = path;
3912 }
3913 else if (PyBytes_Check(path) || is_buffer) {
3914 PyObject *path_bytes = NULL;
3915
3916 if (!PyBytes_Check(path) &&
3917 PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
3918 "path should be string, bytes, or os.PathLike, not %.200s",
3919 Py_TYPE(arg)->tp_name)) {
3920 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003921 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07003922 }
3923 path_bytes = PyBytes_FromObject(path);
3924 Py_DECREF(path);
3925 if (!path_bytes) {
3926 return 0;
3927 }
3928 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
3929 PyBytes_GET_SIZE(path_bytes));
3930 Py_DECREF(path_bytes);
3931 if (!output) {
3932 return 0;
3933 }
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003934 }
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003935 else {
3936 PyErr_Format(PyExc_TypeError,
Brett Cannona5711202016-09-06 19:36:01 -07003937 "path should be string, bytes, or os.PathLike, not %.200s",
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003938 Py_TYPE(arg)->tp_name);
Brett Cannona5711202016-09-06 19:36:01 -07003939 Py_DECREF(path);
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003940 return 0;
3941 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003942 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003943 Py_DECREF(output);
3944 return 0;
3945 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003946 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003947 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003948 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003949 Py_DECREF(output);
3950 return 0;
3951 }
3952 *(PyObject**)addr = output;
3953 return Py_CLEANUP_SUPPORTED;
3954}
3955
3956
Martin v. Löwis5b222132007-06-10 09:51:05 +00003957char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003958PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003959{
Christian Heimesf3863112007-11-22 07:46:41 +00003960 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003961
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003962 if (!PyUnicode_Check(unicode)) {
3963 PyErr_BadArgument();
3964 return NULL;
3965 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003966 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003967 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003968
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003969 if (PyUnicode_UTF8(unicode) == NULL) {
3970 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003971 bytes = _PyUnicode_AsUTF8String(unicode, NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003972 if (bytes == NULL)
3973 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003974 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3975 if (_PyUnicode_UTF8(unicode) == NULL) {
Victor Stinnera5afb582013-10-29 01:28:23 +01003976 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003977 Py_DECREF(bytes);
3978 return NULL;
3979 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003980 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3981 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3982 PyBytes_AS_STRING(bytes),
3983 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003984 Py_DECREF(bytes);
3985 }
3986
3987 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003988 *psize = PyUnicode_UTF8_LENGTH(unicode);
3989 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003990}
3991
3992char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003993PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003994{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003995 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3996}
3997
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003998Py_UNICODE *
3999PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
4000{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004001 const unsigned char *one_byte;
4002#if SIZEOF_WCHAR_T == 4
4003 const Py_UCS2 *two_bytes;
4004#else
4005 const Py_UCS4 *four_bytes;
4006 const Py_UCS4 *ucs4_end;
4007 Py_ssize_t num_surrogates;
4008#endif
4009 wchar_t *w;
4010 wchar_t *wchar_end;
4011
4012 if (!PyUnicode_Check(unicode)) {
4013 PyErr_BadArgument();
4014 return NULL;
4015 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004016 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004017 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004018 assert(_PyUnicode_KIND(unicode) != 0);
4019 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004020
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004021 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004022#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004023 four_bytes = PyUnicode_4BYTE_DATA(unicode);
4024 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004025 num_surrogates = 0;
4026
4027 for (; four_bytes < ucs4_end; ++four_bytes) {
4028 if (*four_bytes > 0xFFFF)
4029 ++num_surrogates;
4030 }
4031
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004032 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
4033 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
4034 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004035 PyErr_NoMemory();
4036 return NULL;
4037 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004038 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004039
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004040 w = _PyUnicode_WSTR(unicode);
4041 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
4042 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004043 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
4044 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004045 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004046 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01004047 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
4048 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004049 }
4050 else
4051 *w = *four_bytes;
4052
4053 if (w > wchar_end) {
4054 assert(0 && "Miscalculated string end");
4055 }
4056 }
4057 *w = 0;
4058#else
4059 /* sizeof(wchar_t) == 4 */
4060 Py_FatalError("Impossible unicode object state, wstr and str "
4061 "should share memory already.");
4062 return NULL;
4063#endif
4064 }
4065 else {
Serhiy Storchakae55181f2015-02-20 21:34:06 +02004066 if ((size_t)_PyUnicode_LENGTH(unicode) >
4067 PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
4068 PyErr_NoMemory();
4069 return NULL;
4070 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004071 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
4072 (_PyUnicode_LENGTH(unicode) + 1));
4073 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004074 PyErr_NoMemory();
4075 return NULL;
4076 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004077 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
4078 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
4079 w = _PyUnicode_WSTR(unicode);
4080 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004081
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004082 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
4083 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004084 for (; w < wchar_end; ++one_byte, ++w)
4085 *w = *one_byte;
4086 /* null-terminate the wstr */
4087 *w = 0;
4088 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004089 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004090#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004091 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004092 for (; w < wchar_end; ++two_bytes, ++w)
4093 *w = *two_bytes;
4094 /* null-terminate the wstr */
4095 *w = 0;
4096#else
4097 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004098 PyObject_FREE(_PyUnicode_WSTR(unicode));
4099 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004100 Py_FatalError("Impossible unicode object state, wstr "
4101 "and str should share memory already.");
4102 return NULL;
4103#endif
4104 }
4105 else {
4106 assert(0 && "This should never happen.");
4107 }
4108 }
4109 }
4110 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004111 *size = PyUnicode_WSTR_LENGTH(unicode);
4112 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00004113}
4114
Alexander Belopolsky40018472011-02-26 01:02:56 +00004115Py_UNICODE *
4116PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004117{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004118 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004119}
4120
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004121
Alexander Belopolsky40018472011-02-26 01:02:56 +00004122Py_ssize_t
4123PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004124{
4125 if (!PyUnicode_Check(unicode)) {
4126 PyErr_BadArgument();
4127 goto onError;
4128 }
4129 return PyUnicode_GET_SIZE(unicode);
4130
Benjamin Peterson29060642009-01-31 22:14:21 +00004131 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004132 return -1;
4133}
4134
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004135Py_ssize_t
4136PyUnicode_GetLength(PyObject *unicode)
4137{
Victor Stinner07621332012-06-16 04:53:46 +02004138 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004139 PyErr_BadArgument();
4140 return -1;
4141 }
Victor Stinner07621332012-06-16 04:53:46 +02004142 if (PyUnicode_READY(unicode) == -1)
4143 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004144 return PyUnicode_GET_LENGTH(unicode);
4145}
4146
4147Py_UCS4
4148PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4149{
Victor Stinner69ed0f42013-04-09 21:48:24 +02004150 void *data;
4151 int kind;
4152
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004153 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
4154 PyErr_BadArgument();
4155 return (Py_UCS4)-1;
4156 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004157 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004158 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004159 return (Py_UCS4)-1;
4160 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02004161 data = PyUnicode_DATA(unicode);
4162 kind = PyUnicode_KIND(unicode);
4163 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004164}
4165
4166int
4167PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4168{
4169 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004170 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004171 return -1;
4172 }
Victor Stinner488fa492011-12-12 00:01:39 +01004173 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004174 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004175 PyErr_SetString(PyExc_IndexError, "string index out of range");
4176 return -1;
4177 }
Victor Stinner488fa492011-12-12 00:01:39 +01004178 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004179 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004180 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4181 PyErr_SetString(PyExc_ValueError, "character out of range");
4182 return -1;
4183 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004184 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4185 index, ch);
4186 return 0;
4187}
4188
/* Return the name of the default encoding, which is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4194
Victor Stinner554f3f02010-06-16 23:33:54 +00004195/* create or adjust a UnicodeDecodeError */
4196static void
4197make_decode_exception(PyObject **exceptionObject,
4198 const char *encoding,
4199 const char *input, Py_ssize_t length,
4200 Py_ssize_t startpos, Py_ssize_t endpos,
4201 const char *reason)
4202{
4203 if (*exceptionObject == NULL) {
4204 *exceptionObject = PyUnicodeDecodeError_Create(
4205 encoding, input, length, startpos, endpos, reason);
4206 }
4207 else {
4208 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4209 goto onError;
4210 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4211 goto onError;
4212 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4213 goto onError;
4214 }
4215 return;
4216
4217onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004218 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004219}
4220
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004221#ifdef HAVE_MBCS
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004222/* error handling callback helper:
4223 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00004224 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004225 and adjust various state variables.
4226 return 0 on success, -1 on error
4227*/
4228
Alexander Belopolsky40018472011-02-26 01:02:56 +00004229static int
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004230unicode_decode_call_errorhandler_wchar(
4231 const char *errors, PyObject **errorHandler,
4232 const char *encoding, const char *reason,
4233 const char **input, const char **inend, Py_ssize_t *startinpos,
4234 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4235 PyObject **output, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004236{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02004237 static const char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004238
4239 PyObject *restuple = NULL;
4240 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01004241 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004242 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004243 Py_ssize_t requiredsize;
4244 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004245 PyObject *inputobj = NULL;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004246 wchar_t *repwstr;
4247 Py_ssize_t repwlen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004248
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004249 assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
4250 outsize = _PyUnicode_WSTR_LENGTH(*output);
Victor Stinner596a6c42011-11-09 00:02:18 +01004251
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004252 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004253 *errorHandler = PyCodec_LookupError(errors);
4254 if (*errorHandler == NULL)
4255 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004256 }
4257
Victor Stinner554f3f02010-06-16 23:33:54 +00004258 make_decode_exception(exceptionObject,
4259 encoding,
4260 *input, *inend - *input,
4261 *startinpos, *endinpos,
4262 reason);
4263 if (*exceptionObject == NULL)
4264 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004265
4266 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4267 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004268 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004269 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00004270 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004271 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004272 }
4273 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004274 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004275
4276 /* Copy back the bytes variables, which might have been modified by the
4277 callback */
4278 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4279 if (!inputobj)
4280 goto onError;
4281 if (!PyBytes_Check(inputobj)) {
4282 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
4283 }
4284 *input = PyBytes_AS_STRING(inputobj);
4285 insize = PyBytes_GET_SIZE(inputobj);
4286 *inend = *input + insize;
4287 /* we can DECREF safely, as the exception has another reference,
4288 so the object won't go away. */
4289 Py_DECREF(inputobj);
4290
4291 if (newpos<0)
4292 newpos = insize+newpos;
4293 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004294 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004295 goto onError;
4296 }
4297
4298 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4299 if (repwstr == NULL)
4300 goto onError;
4301 /* need more space? (at least enough for what we
4302 have+the replacement+the rest of the string (starting
4303 at the new input position), so we won't have to check space
4304 when there are no errors in the rest of the string) */
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004305 requiredsize = *outpos;
4306 if (requiredsize > PY_SSIZE_T_MAX - repwlen)
4307 goto overflow;
4308 requiredsize += repwlen;
4309 if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4310 goto overflow;
4311 requiredsize += insize - newpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004312 if (requiredsize > outsize) {
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004313 if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004314 requiredsize = 2*outsize;
4315 if (unicode_resize(output, requiredsize) < 0)
4316 goto onError;
4317 }
4318 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
4319 *outpos += repwlen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004320 *endinpos = newpos;
4321 *inptr = *input + newpos;
4322
4323 /* we made it! */
4324 Py_XDECREF(restuple);
4325 return 0;
4326
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004327 overflow:
4328 PyErr_SetString(PyExc_OverflowError,
4329 "decoded result is too long for a Python string");
4330
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004331 onError:
4332 Py_XDECREF(restuple);
4333 return -1;
4334}
4335#endif /* HAVE_MBCS */
4336
4337static int
4338unicode_decode_call_errorhandler_writer(
4339 const char *errors, PyObject **errorHandler,
4340 const char *encoding, const char *reason,
4341 const char **input, const char **inend, Py_ssize_t *startinpos,
4342 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4343 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4344{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02004345 static const char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004346
4347 PyObject *restuple = NULL;
4348 PyObject *repunicode = NULL;
4349 Py_ssize_t insize;
4350 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004351 Py_ssize_t replen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004352 PyObject *inputobj = NULL;
4353
4354 if (*errorHandler == NULL) {
4355 *errorHandler = PyCodec_LookupError(errors);
4356 if (*errorHandler == NULL)
4357 goto onError;
4358 }
4359
4360 make_decode_exception(exceptionObject,
4361 encoding,
4362 *input, *inend - *input,
4363 *startinpos, *endinpos,
4364 reason);
4365 if (*exceptionObject == NULL)
4366 goto onError;
4367
4368 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4369 if (restuple == NULL)
4370 goto onError;
4371 if (!PyTuple_Check(restuple)) {
4372 PyErr_SetString(PyExc_TypeError, &argparse[4]);
4373 goto onError;
4374 }
4375 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004376 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004377
4378 /* Copy back the bytes variables, which might have been modified by the
4379 callback */
4380 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4381 if (!inputobj)
4382 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00004383 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004384 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00004385 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004386 *input = PyBytes_AS_STRING(inputobj);
4387 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004388 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004389 /* we can DECREF safely, as the exception has another reference,
4390 so the object won't go away. */
4391 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004392
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004393 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004394 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004395 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004396 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004397 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004398 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004399
Victor Stinner8f674cc2013-04-17 23:02:17 +02004400 if (PyUnicode_READY(repunicode) < 0)
4401 goto onError;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004402 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004403 if (replen > 1) {
4404 writer->min_length += replen - 1;
Victor Stinner8f674cc2013-04-17 23:02:17 +02004405 writer->overallocate = 1;
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004406 if (_PyUnicodeWriter_Prepare(writer, writer->min_length,
4407 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4408 goto onError;
4409 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004410 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004411 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004412
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004413 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004414 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004415
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004416 /* we made it! */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004417 Py_XDECREF(restuple);
4418 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004419
Benjamin Peterson29060642009-01-31 22:14:21 +00004420 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004421 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004422 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004423}
4424
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004425/* --- UTF-7 Codec -------------------------------------------------------- */
4426
Antoine Pitrou244651a2009-05-04 18:56:13 +00004427/* See RFC2152 for details. We encode conservatively and decode liberally. */
4428
4429/* Three simple macros defining base-64. */
4430
4431/* Is c a base-64 character? */
4432
#define IS_BASE64(c)                            \
    (('A' <= (c) && (c) <= 'Z') ||              \
     ('a' <= (c) && (c) <= 'z') ||              \
     ('0' <= (c) && (c) <= '9') ||              \
     (c) == '+' || (c) == '/')

/* Map a base-64 character c to its 6-bit value.  The caller must already
   know that c is a base-64 character: anything unrecognised maps to 63. */

#define FROM_BASE64(c)                                      \
    (('A' <= (c) && (c) <= 'Z') ? (c) - 'A' :               \
     ('a' <= (c) && (c) <= 'z') ? (c) - 'a' + 26 :          \
     ('0' <= (c) && (c) <= '9') ? (c) - '0' + 52 :          \
     (c) == '+' ? 62 : 63)

/* Base-64 character for the low 6 bits of n (higher bits are ignored). */

#define TO_BASE64(n)                            \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"               \
     "abcdefghijklmnopqrstuvwxyz"               \
     "0123456789+/"[(n) & 0x3f])
4451
/* DECODE_DIRECT: true when a byte found in a UTF-7 string decodes to
 * itself.  Decoding is permissive: every ASCII byte qualifies except '+',
 * which opens a base64 section. */

#define DECODE_DIRECT(c) ((c) < 0x80 && (c) != '+')
4459
4460/* The UTF-7 encoder treats ASCII characters differently according to
4461 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
4462 * the above). See RFC2152. This array identifies these different
4463 * sets:
4464 * 0 : "Set D"
4465 * alphanumeric and '(),-./:?
4466 * 1 : "Set O"
4467 * !"#$%&*;<=>@[]^_`{|}
4468 * 2 : "whitespace"
4469 * ht nl cr sp
4470 * 3 : special (must be base64 encoded)
4471 * everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
4472 */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004473
/* Read-only lookup table indexed by ASCII code (0..127); it is only ever
   read, so it is declared const. */
static const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
4493
/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c must be an integer character value; only 1..127 can ever be direct.
 * directO / directWS are truth values; they are parenthesized in the
 * expansion so that any expression (e.g. `a || b`) can be passed safely.
 * c is evaluated more than once, so it must not have side effects. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004505
Alexander Belopolsky40018472011-02-26 01:02:56 +00004506PyObject *
4507PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004508 Py_ssize_t size,
4509 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004510{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004511 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4512}
4513
/* The decoder.  The only state we preserve is our read position,
 * i.e. how many characters we have consumed.  So if we end in the
 * middle of a shift sequence we have to back off the read position
 * and the output to the beginning of the sequence, otherwise we lose
 * all the shift state (seen bits, number of bits seen, high
 * surrogate).
 *
 * s, size:   the UTF-7 bytes to decode.
 * errors:    error handler name, forwarded to
 *            unicode_decode_call_errorhandler_writer().
 * consumed:  may be NULL.  If NULL, input that ends inside a shift
 *            sequence in an inconsistent state is reported as an
 *            "unterminated shift sequence" error.  If non-NULL, no such
 *            error is raised; *consumed receives the number of input bytes
 *            accounted for, which is the position of the opening '+' when
 *            the input ends inside a shift sequence.
 *
 * Returns the object built by the unicode writer, or NULL on failure.
 */

PyObject *
PyUnicode_DecodeUTF7Stateful(const char *s,
                             Py_ssize_t size,
                             const char *errors,
                             Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;       /* input offset of the '+' / bad byte being handled */
    Py_ssize_t endinpos;
    const char *e;
    _PyUnicodeWriter writer;
    const char *errmsg = "";
    int inShift = 0;             /* non-zero while inside a base-64 section */
    Py_ssize_t shiftOutStart;    /* writer.pos when the current shift section began */
    unsigned int base64bits = 0; /* number of pending bits held in base64buffer */
    unsigned long base64buffer = 0;
    Py_UCS4 surrogate = 0;       /* pending high surrogate, or 0 if none */
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    if (size == 0) {
        if (consumed)
            *consumed = 0;
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* Start off assuming it's all ASCII. Widen later as necessary. */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = size;

    shiftOutStart = 0;
    e = s + size;

    while (s < e) {
        Py_UCS4 ch;
      restart:
        /* Also entered by `goto restart` from the end-of-string error
           handling below, when the error handler left input to process. */
        ch = (unsigned char) *s;

        if (inShift) { /* in a base-64 section */
            if (IS_BASE64(ch)) { /* consume a base-64 character */
                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
                base64bits += 6;
                s++;
                if (base64bits >= 16) {
                    /* we have enough bits for a UTF-16 value */
                    Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
                    base64bits -= 16;
                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
                    assert(outCh <= 0xffff);
                    if (surrogate) {
                        /* expecting a second surrogate */
                        if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
                            Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
                            if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
                                goto onError;
                            surrogate = 0;
                            continue;
                        }
                        else {
                            /* not a low surrogate: emit the pending high
                               surrogate on its own, then handle outCh below */
                            if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
                                goto onError;
                            surrogate = 0;
                        }
                    }
                    if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
                        /* first surrogate */
                        surrogate = outCh;
                    }
                    else {
                        if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
                            goto onError;
                    }
                }
            }
            else { /* now leaving a base-64 section */
                inShift = 0;
                if (base64bits > 0) { /* left-over bits */
                    if (base64bits >= 6) {
                        /* We've seen at least one base-64 character */
                        s++;
                        errmsg = "partial character in shift sequence";
                        goto utf7Error;
                    }
                    else {
                        /* Some bits remain; they should be zero */
                        if (base64buffer != 0) {
                            s++;
                            errmsg = "non-zero padding bits in shift sequence";
                            goto utf7Error;
                        }
                    }
                }
                /* a pending high surrogate is written out only when the
                   terminating byte is one that decodes directly */
                if (surrogate && DECODE_DIRECT(ch)) {
                    if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
                        goto onError;
                }
                surrogate = 0;
                if (ch == '-') {
                    /* '-' is absorbed; other terminating
                       characters are preserved */
                    s++;
                }
            }
        }
        else if ( ch == '+' ) {
            startinpos = s-starts;
            s++; /* consume '+' */
            if (s < e && *s == '-') { /* '+-' encodes '+' */
                s++;
                if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
                    goto onError;
            }
            else { /* begin base64-encoded section */
                inShift = 1;
                surrogate = 0;
                /* remember where this section's output starts so it can be
                   discarded if the input ends mid-sequence */
                shiftOutStart = writer.pos;
                base64bits = 0;
                base64buffer = 0;
            }
        }
        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
            s++;
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                goto onError;
        }
        else {
            startinpos = s-starts;
            s++;
            errmsg = "unexpected special character";
            goto utf7Error;
        }
        continue;
utf7Error:
        /* Reached only via goto.  The handler may re-point starts/e and
           sets s to the position where decoding should resume. */
        endinpos = s-starts;
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                "utf7", errmsg,
                &starts, &e, &startinpos, &endinpos, &exc, &s,
                &writer))
            goto onError;
    }

    /* end of string */

    if (inShift && !consumed) { /* in shift sequence, no more to follow */
        /* if we're in an inconsistent state, that's an error */
        inShift = 0;
        if (surrogate ||
                (base64bits >= 6) ||
                (base64bits > 0 && base64buffer != 0)) {
            endinpos = size;
            if (unicode_decode_call_errorhandler_writer(
                    errors, &errorHandler,
                    "utf7", "unterminated shift sequence",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    &writer))
                goto onError;
            /* the handler chose a resume position before the end of the
               input: jump back into the decoding loop */
            if (s < e)
                goto restart;
        }
    }

    /* return state */
    if (consumed) {
        if (inShift) {
            *consumed = startinpos;
            /* Output was produced inside the unfinished shift sequence and
               the writer holds non-ASCII data: build the result directly
               from the first shiftOutStart characters instead of finishing
               the writer.  NOTE(review): presumably because writer.maxchar
               may reflect characters that are being discarded -- confirm. */
            if (writer.pos != shiftOutStart && writer.maxchar > 127) {
                PyObject *result = PyUnicode_FromKindAndData(
                        writer.kind, writer.data, shiftOutStart);
                Py_XDECREF(errorHandler);
                Py_XDECREF(exc);
                _PyUnicodeWriter_Dealloc(&writer);
                return result;
            }
            writer.pos = shiftOutStart; /* back off output */
        }
        else {
            *consumed = s-starts;
        }
    }

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    _PyUnicodeWriter_Dealloc(&writer);
    return NULL;
}
4712
4713
Alexander Belopolsky40018472011-02-26 01:02:56 +00004714PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004715_PyUnicode_EncodeUTF7(PyObject *str,
4716 int base64SetO,
4717 int base64WhiteSpace,
4718 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004719{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004720 int kind;
4721 void *data;
4722 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004723 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004724 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004725 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004726 unsigned int base64bits = 0;
4727 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004728 char * out;
4729 char * start;
4730
Benjamin Petersonbac79492012-01-14 13:34:47 -05004731 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004732 return NULL;
4733 kind = PyUnicode_KIND(str);
4734 data = PyUnicode_DATA(str);
4735 len = PyUnicode_GET_LENGTH(str);
4736
4737 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004738 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004739
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004740 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004741 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004742 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004743 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004744 if (v == NULL)
4745 return NULL;
4746
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004747 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004748 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004749 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004750
Antoine Pitrou244651a2009-05-04 18:56:13 +00004751 if (inShift) {
4752 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4753 /* shifting out */
4754 if (base64bits) { /* output remaining bits */
4755 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4756 base64buffer = 0;
4757 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004758 }
4759 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004760 /* Characters not in the BASE64 set implicitly unshift the sequence
4761 so no '-' is required, except if the character is itself a '-' */
4762 if (IS_BASE64(ch) || ch == '-') {
4763 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004764 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004765 *out++ = (char) ch;
4766 }
4767 else {
4768 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004769 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004770 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004771 else { /* not in a shift sequence */
4772 if (ch == '+') {
4773 *out++ = '+';
4774 *out++ = '-';
4775 }
4776 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4777 *out++ = (char) ch;
4778 }
4779 else {
4780 *out++ = '+';
4781 inShift = 1;
4782 goto encode_char;
4783 }
4784 }
4785 continue;
4786encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004787 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004788 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004789
Antoine Pitrou244651a2009-05-04 18:56:13 +00004790 /* code first surrogate */
4791 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004792 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004793 while (base64bits >= 6) {
4794 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4795 base64bits -= 6;
4796 }
4797 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004798 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004799 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004800 base64bits += 16;
4801 base64buffer = (base64buffer << 16) | ch;
4802 while (base64bits >= 6) {
4803 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4804 base64bits -= 6;
4805 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004806 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004807 if (base64bits)
4808 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4809 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004810 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004811 if (_PyBytes_Resize(&v, out - start) < 0)
4812 return NULL;
4813 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004814}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004815PyObject *
4816PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4817 Py_ssize_t size,
4818 int base64SetO,
4819 int base64WhiteSpace,
4820 const char *errors)
4821{
4822 PyObject *result;
4823 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4824 if (tmp == NULL)
4825 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004826 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004827 base64WhiteSpace, errors);
4828 Py_DECREF(tmp);
4829 return result;
4830}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004831
Antoine Pitrou244651a2009-05-04 18:56:13 +00004832#undef IS_BASE64
4833#undef FROM_BASE64
4834#undef TO_BASE64
4835#undef DECODE_DIRECT
4836#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004837
Guido van Rossumd57fd912000-03-10 22:53:23 +00004838/* --- UTF-8 Codec -------------------------------------------------------- */
4839
Alexander Belopolsky40018472011-02-26 01:02:56 +00004840PyObject *
4841PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004842 Py_ssize_t size,
4843 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004844{
Walter Dörwald69652032004-09-07 20:24:22 +00004845 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4846}
4847
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004848#include "stringlib/asciilib.h"
4849#include "stringlib/codecs.h"
4850#include "stringlib/undef.h"
4851
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004852#include "stringlib/ucs1lib.h"
4853#include "stringlib/codecs.h"
4854#include "stringlib/undef.h"
4855
4856#include "stringlib/ucs2lib.h"
4857#include "stringlib/codecs.h"
4858#include "stringlib/undef.h"
4859
4860#include "stringlib/ucs4lib.h"
4861#include "stringlib/codecs.h"
4862#include "stringlib/undef.h"
4863
Antoine Pitrouab868312009-01-10 15:40:25 +00004864/* Mask to quickly check whether a C 'long' contains a
4865 non-ASCII, UTF8-encoded char. */
4866#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004867# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004868#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004869# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004870#else
4871# error C 'long' size should be either 4 or 8!
4872#endif
4873
/* Copy the leading run of ASCII bytes (bit 0x80 clear) of [start, end)
   into dest and return its length.  Scanning stops at the first byte with
   bit 0x80 set, or at end; exactly the returned number of bytes is written
   to dest.

   The word-at-a-time copying path asserts that dest is aligned to
   SIZEOF_LONG; it is compiled in only when not building for m68k and when
   SIZEOF_LONG <= SIZEOF_VOID_P, and is taken only if start is aligned too. */
static Py_ssize_t
ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
{
    const char *p = start;
    /* end rounded down to a multiple of SIZEOF_LONG: whole-word reads must
       stay below this address */
    const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);

    /*
     * Issue #17237: m68k is a bit different from most architectures in
     * that objects do not use "natural alignment" - for example, int and
     * long are only aligned at 2-byte boundaries.  Therefore the assert()
     * won't work; also, tests have shown that skipping the "optimised
     * version" will even speed up m68k.
     */
#if !defined(__m68k__)
#if SIZEOF_LONG <= SIZEOF_VOID_P
    assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
    if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
        /* Fast path, see in STRINGLIB(utf8_decode) for
           an explanation. */
        /* Help allocation (NOTE(review): presumably register allocation,
           by working on local copies -- confirm) */
        const char *_p = p;
        Py_UCS1 * q = dest;
        /* copy one long at a time while no byte in the word has 0x80 set */
        while (_p < aligned_end) {
            unsigned long value = *(const unsigned long *) _p;
            if (value & ASCII_CHAR_MASK)
                break;
            *((unsigned long *)q) = value;
            _p += SIZEOF_LONG;
            q += SIZEOF_LONG;
        }
        p = _p;
        /* finish byte by byte: the unaligned tail, or the bytes of the word
           that precede the first non-ASCII byte */
        while (p < end) {
            if ((unsigned char)*p & 0x80)
                break;
            *q++ = *p++;
        }
        return p - start;
    }
#endif
#endif
    /* Generic path: only scan here; the copy is a single memcpy() below. */
    while (p < end) {
        /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
           for an explanation. */
        if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
            /* Help allocation (NOTE(review): presumably register
               allocation -- confirm) */
            const char *_p = p;
            while (_p < aligned_end) {
                unsigned long value = *(unsigned long *) _p;
                if (value & ASCII_CHAR_MASK)
                    break;
                _p += SIZEOF_LONG;
            }
            p = _p;
            /* reached the end exactly: do not read *p below */
            if (_p == end)
                break;
        }
        if ((unsigned char)*p & 0x80)
            break;
        ++p;
    }
    memcpy(dest, start, p - start);
    return p - start;
}
Antoine Pitrouab868312009-01-10 15:40:25 +00004937
Victor Stinner785938e2011-12-11 20:09:03 +01004938PyObject *
4939PyUnicode_DecodeUTF8Stateful(const char *s,
4940 Py_ssize_t size,
4941 const char *errors,
4942 Py_ssize_t *consumed)
4943{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004944 _PyUnicodeWriter writer;
Victor Stinner785938e2011-12-11 20:09:03 +01004945 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004946 const char *end = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004947
4948 Py_ssize_t startinpos;
4949 Py_ssize_t endinpos;
4950 const char *errmsg = "";
Victor Stinner1d65d912015-10-05 13:43:50 +02004951 PyObject *error_handler_obj = NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004952 PyObject *exc = NULL;
Victor Stinner1d65d912015-10-05 13:43:50 +02004953 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner785938e2011-12-11 20:09:03 +01004954
4955 if (size == 0) {
4956 if (consumed)
4957 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004958 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004959 }
4960
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004961 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4962 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004963 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004964 *consumed = 1;
4965 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004966 }
4967
Victor Stinner8f674cc2013-04-17 23:02:17 +02004968 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02004969 writer.min_length = size;
4970 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004971 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004972
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004973 writer.pos = ascii_decode(s, end, writer.data);
4974 s += writer.pos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004975 while (s < end) {
4976 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004977 int kind = writer.kind;
Victor Stinner1d65d912015-10-05 13:43:50 +02004978
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004979 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004980 if (PyUnicode_IS_ASCII(writer.buffer))
4981 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004982 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004983 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004984 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004985 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004986 } else {
4987 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004988 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004989 }
4990
4991 switch (ch) {
4992 case 0:
4993 if (s == end || consumed)
4994 goto End;
4995 errmsg = "unexpected end of data";
4996 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004997 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004998 break;
4999 case 1:
5000 errmsg = "invalid start byte";
5001 startinpos = s - starts;
5002 endinpos = startinpos + 1;
5003 break;
5004 case 2:
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005005 case 3:
5006 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005007 errmsg = "invalid continuation byte";
5008 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005009 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005010 break;
5011 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005012 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005013 goto onError;
5014 continue;
5015 }
5016
Victor Stinner1d65d912015-10-05 13:43:50 +02005017 if (error_handler == _Py_ERROR_UNKNOWN)
5018 error_handler = get_error_handler(errors);
5019
5020 switch (error_handler) {
5021 case _Py_ERROR_IGNORE:
5022 s += (endinpos - startinpos);
5023 break;
5024
5025 case _Py_ERROR_REPLACE:
5026 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
5027 goto onError;
5028 s += (endinpos - startinpos);
5029 break;
5030
5031 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner74e8fac2015-10-05 13:49:26 +02005032 {
5033 Py_ssize_t i;
5034
Victor Stinner1d65d912015-10-05 13:43:50 +02005035 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
5036 goto onError;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005037 for (i=startinpos; i<endinpos; i++) {
Victor Stinner1d65d912015-10-05 13:43:50 +02005038 ch = (Py_UCS4)(unsigned char)(starts[i]);
5039 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
5040 ch + 0xdc00);
5041 writer.pos++;
5042 }
5043 s += (endinpos - startinpos);
5044 break;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005045 }
Victor Stinner1d65d912015-10-05 13:43:50 +02005046
5047 default:
5048 if (unicode_decode_call_errorhandler_writer(
5049 errors, &error_handler_obj,
5050 "utf-8", errmsg,
5051 &starts, &end, &startinpos, &endinpos, &exc, &s,
5052 &writer))
5053 goto onError;
5054 }
Victor Stinner785938e2011-12-11 20:09:03 +01005055 }
5056
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005057End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005058 if (consumed)
5059 *consumed = s - starts;
5060
Victor Stinner1d65d912015-10-05 13:43:50 +02005061 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005062 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005063 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005064
5065onError:
Victor Stinner1d65d912015-10-05 13:43:50 +02005066 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005067 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005068 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005069 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01005070}
5071
#ifdef __APPLE__

/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X.

   s    -- UTF-8 encoded input bytes
   size -- number of input bytes

   Every byte that cannot be decoded is stored as the lone surrogate
   0xDC00 + byte.  The result is always NUL-terminated.

   Return a pointer to a newly allocated wide character string (use
   PyMem_RawFree() to free the memory), or NULL on memory allocation error
   (including a size too large to allocate). */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    const char *e;
    wchar_t *unicode;
    Py_ssize_t outpos;

    /* Note: size will always be longer than the resulting Unicode
       character count, so size + 1 wide characters (including the
       terminator) are enough.  The test is written without computing
       size + 1, which would overflow for size == PY_SSIZE_T_MAX. */
    if (size > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1)
        return NULL;
    unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
    if (!unicode)
        return NULL;

    /* Unpack UTF-8 encoded data */
    e = s + size;
    outpos = 0;
    while (s < e) {
        Py_UCS4 ch;
#if SIZEOF_WCHAR_T == 4
        ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
#else
        ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
#endif
        if (ch > 0xFF) {
#if SIZEOF_WCHAR_T == 4
            /* With 4-byte storage no value is expected here. */
            assert(0);
#else
            /* The value is split into a surrogate pair below, so it must
               be a supplementary code point -- not itself a surrogate. */
            assert(ch > 0xFFFF && ch <= 0x10FFFF);
            /* compute and append the two surrogates: */
            unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
            unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
#endif
        }
        else {
            /* Clean end of input. */
            if (!ch && s == e)
                break;
            /* surrogateescape: escape one byte and retry from the next */
            unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
        }
    }
    unicode[outpos] = L'\0';
    return unicode;
}

#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00005127
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005128/* Primary internal function which creates utf8 encoded bytes objects.
5129
5130 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005131 and allocate exactly as much space needed at the end. Else allocate the
5132 maximum possible needed (4 result bytes per Unicode character), and return
5133 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005134*/
Tim Peters7e3d9612002-04-21 03:26:37 +00005135PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01005136_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005137{
Victor Stinner6099a032011-12-18 14:22:26 +01005138 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005139 void *data;
5140 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00005141
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005142 if (!PyUnicode_Check(unicode)) {
5143 PyErr_BadArgument();
5144 return NULL;
5145 }
5146
5147 if (PyUnicode_READY(unicode) == -1)
5148 return NULL;
5149
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005150 if (PyUnicode_UTF8(unicode))
5151 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5152 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005153
5154 kind = PyUnicode_KIND(unicode);
5155 data = PyUnicode_DATA(unicode);
5156 size = PyUnicode_GET_LENGTH(unicode);
5157
Benjamin Petersonead6b532011-12-20 17:23:42 -06005158 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005159 default:
5160 assert(0);
5161 case PyUnicode_1BYTE_KIND:
5162 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5163 assert(!PyUnicode_IS_ASCII(unicode));
5164 return ucs1lib_utf8_encoder(unicode, data, size, errors);
5165 case PyUnicode_2BYTE_KIND:
5166 return ucs2lib_utf8_encoder(unicode, data, size, errors);
5167 case PyUnicode_4BYTE_KIND:
5168 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00005169 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005170}
5171
Alexander Belopolsky40018472011-02-26 01:02:56 +00005172PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005173PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5174 Py_ssize_t size,
5175 const char *errors)
5176{
5177 PyObject *v, *unicode;
5178
5179 unicode = PyUnicode_FromUnicode(s, size);
5180 if (unicode == NULL)
5181 return NULL;
5182 v = _PyUnicode_AsUTF8String(unicode, errors);
5183 Py_DECREF(unicode);
5184 return v;
5185}
5186
5187PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005188PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005189{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005190 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005191}
5192
/* --- UTF-32 Codec ------------------------------------------------------- */
5194
5195PyObject *
5196PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005197 Py_ssize_t size,
5198 const char *errors,
5199 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005200{
5201 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5202}
5203
5204PyObject *
5205PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005206 Py_ssize_t size,
5207 const char *errors,
5208 int *byteorder,
5209 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005210{
5211 const char *starts = s;
5212 Py_ssize_t startinpos;
5213 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005214 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005215 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01005216 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005217 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005218 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005219 PyObject *errorHandler = NULL;
5220 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005221
Walter Dörwald41980ca2007-08-16 21:55:45 +00005222 q = (unsigned char *)s;
5223 e = q + size;
5224
5225 if (byteorder)
5226 bo = *byteorder;
5227
5228 /* Check for BOM marks (U+FEFF) in the input and adjust current
5229 byte order setting accordingly. In native mode, the leading BOM
5230 mark is skipped, in all other modes, it is copied to the output
5231 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01005232 if (bo == 0 && size >= 4) {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005233 Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005234 if (bom == 0x0000FEFF) {
5235 bo = -1;
5236 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005237 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005238 else if (bom == 0xFFFE0000) {
5239 bo = 1;
5240 q += 4;
5241 }
5242 if (byteorder)
5243 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005244 }
5245
Victor Stinnere64322e2012-10-30 23:12:47 +01005246 if (q == e) {
5247 if (consumed)
5248 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005249 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00005250 }
5251
Victor Stinnere64322e2012-10-30 23:12:47 +01005252#ifdef WORDS_BIGENDIAN
5253 le = bo < 0;
5254#else
5255 le = bo <= 0;
5256#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005257 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01005258
Victor Stinner8f674cc2013-04-17 23:02:17 +02005259 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005260 writer.min_length = (e - q + 3) / 4;
5261 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005262 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005263
Victor Stinnere64322e2012-10-30 23:12:47 +01005264 while (1) {
5265 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005266 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005267
Victor Stinnere64322e2012-10-30 23:12:47 +01005268 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005269 enum PyUnicode_Kind kind = writer.kind;
5270 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005271 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005272 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005273 if (le) {
5274 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005275 ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005276 if (ch > maxch)
5277 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005278 if (kind != PyUnicode_1BYTE_KIND &&
5279 Py_UNICODE_IS_SURROGATE(ch))
5280 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005281 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005282 q += 4;
5283 } while (q <= last);
5284 }
5285 else {
5286 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005287 ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
Victor Stinnere64322e2012-10-30 23:12:47 +01005288 if (ch > maxch)
5289 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005290 if (kind != PyUnicode_1BYTE_KIND &&
5291 Py_UNICODE_IS_SURROGATE(ch))
5292 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005293 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005294 q += 4;
5295 } while (q <= last);
5296 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005297 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005298 }
5299
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005300 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005301 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005302 startinpos = ((const char *)q) - starts;
5303 endinpos = startinpos + 4;
5304 }
5305 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005306 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005307 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005308 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005309 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005310 startinpos = ((const char *)q) - starts;
5311 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005312 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005313 else {
5314 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005315 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005316 goto onError;
5317 q += 4;
5318 continue;
5319 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005320 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005321 startinpos = ((const char *)q) - starts;
5322 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005323 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005324
5325 /* The remaining input chars are ignored if the callback
5326 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005327 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005328 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005329 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005330 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005331 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005332 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005333 }
5334
Walter Dörwald41980ca2007-08-16 21:55:45 +00005335 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005336 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005337
Walter Dörwald41980ca2007-08-16 21:55:45 +00005338 Py_XDECREF(errorHandler);
5339 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005340 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005341
Benjamin Peterson29060642009-01-31 22:14:21 +00005342 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005343 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005344 Py_XDECREF(errorHandler);
5345 Py_XDECREF(exc);
5346 return NULL;
5347}
5348
5349PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005350_PyUnicode_EncodeUTF32(PyObject *str,
5351 const char *errors,
5352 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005353{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005354 enum PyUnicode_Kind kind;
5355 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005356 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005357 PyObject *v;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005358 uint32_t *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005359#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005360 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005361#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005362 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005363#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005364 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005365 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005366 PyObject *errorHandler = NULL;
5367 PyObject *exc = NULL;
5368 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005369
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005370 if (!PyUnicode_Check(str)) {
5371 PyErr_BadArgument();
5372 return NULL;
5373 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005374 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005375 return NULL;
5376 kind = PyUnicode_KIND(str);
5377 data = PyUnicode_DATA(str);
5378 len = PyUnicode_GET_LENGTH(str);
5379
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005380 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005381 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005382 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005383 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005384 if (v == NULL)
5385 return NULL;
5386
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005387 /* output buffer is 4-bytes aligned */
5388 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005389 out = (uint32_t *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005390 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005391 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005392 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005393 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005394
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005395 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005396 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005397 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005398 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005399 else
5400 encoding = "utf-32";
5401
5402 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005403 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5404 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005405 }
5406
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005407 pos = 0;
5408 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005409 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005410
5411 if (kind == PyUnicode_2BYTE_KIND) {
5412 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5413 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005414 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005415 else {
5416 assert(kind == PyUnicode_4BYTE_KIND);
5417 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5418 &out, native_ordering);
5419 }
5420 if (pos == len)
5421 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005422
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005423 rep = unicode_encode_call_errorhandler(
5424 errors, &errorHandler,
5425 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005426 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005427 if (!rep)
5428 goto error;
5429
5430 if (PyBytes_Check(rep)) {
5431 repsize = PyBytes_GET_SIZE(rep);
5432 if (repsize & 3) {
5433 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005434 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005435 "surrogates not allowed");
5436 goto error;
5437 }
5438 moreunits = repsize / 4;
5439 }
5440 else {
5441 assert(PyUnicode_Check(rep));
5442 if (PyUnicode_READY(rep) < 0)
5443 goto error;
5444 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5445 if (!PyUnicode_IS_ASCII(rep)) {
5446 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005447 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005448 "surrogates not allowed");
5449 goto error;
5450 }
5451 }
5452
5453 /* four bytes are reserved for each surrogate */
5454 if (moreunits > 1) {
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005455 Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005456 Py_ssize_t morebytes = 4 * (moreunits - 1);
5457 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5458 /* integer overflow */
5459 PyErr_NoMemory();
5460 goto error;
5461 }
5462 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5463 goto error;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005464 out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005465 }
5466
5467 if (PyBytes_Check(rep)) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005468 Py_MEMCPY(out, PyBytes_AS_STRING(rep), repsize);
5469 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005470 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005471 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005472 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5473 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005474 }
5475
5476 Py_CLEAR(rep);
5477 }
5478
5479 /* Cut back to size actually needed. This is necessary for, for example,
5480 encoding of a string containing isolated surrogates and the 'ignore'
5481 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005482 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005483 if (nsize != PyBytes_GET_SIZE(v))
5484 _PyBytes_Resize(&v, nsize);
5485 Py_XDECREF(errorHandler);
5486 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005487 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005488 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005489 error:
5490 Py_XDECREF(rep);
5491 Py_XDECREF(errorHandler);
5492 Py_XDECREF(exc);
5493 Py_XDECREF(v);
5494 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005495}
5496
Alexander Belopolsky40018472011-02-26 01:02:56 +00005497PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005498PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5499 Py_ssize_t size,
5500 const char *errors,
5501 int byteorder)
5502{
5503 PyObject *result;
5504 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5505 if (tmp == NULL)
5506 return NULL;
5507 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5508 Py_DECREF(tmp);
5509 return result;
5510}
5511
5512PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005513PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005514{
Victor Stinnerb960b342011-11-20 19:12:52 +01005515 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005516}
5517
/* --- UTF-16 Codec ------------------------------------------------------- */
5519
Tim Peters772747b2001-08-09 22:21:55 +00005520PyObject *
5521PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005522 Py_ssize_t size,
5523 const char *errors,
5524 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005525{
Walter Dörwald69652032004-09-07 20:24:22 +00005526 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5527}
5528
5529PyObject *
5530PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005531 Py_ssize_t size,
5532 const char *errors,
5533 int *byteorder,
5534 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005535{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005536 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005537 Py_ssize_t startinpos;
5538 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005539 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005540 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005541 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005542 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005543 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005544 PyObject *errorHandler = NULL;
5545 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005546 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005547
Tim Peters772747b2001-08-09 22:21:55 +00005548 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005549 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005550
5551 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005552 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005553
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005554 /* Check for BOM marks (U+FEFF) in the input and adjust current
5555 byte order setting accordingly. In native mode, the leading BOM
5556 mark is skipped, in all other modes, it is copied to the output
5557 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005558 if (bo == 0 && size >= 2) {
5559 const Py_UCS4 bom = (q[1] << 8) | q[0];
5560 if (bom == 0xFEFF) {
5561 q += 2;
5562 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005563 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005564 else if (bom == 0xFFFE) {
5565 q += 2;
5566 bo = 1;
5567 }
5568 if (byteorder)
5569 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005570 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005571
Antoine Pitrou63065d72012-05-15 23:48:04 +02005572 if (q == e) {
5573 if (consumed)
5574 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005575 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005576 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005577
Christian Heimes743e0cd2012-10-17 23:52:17 +02005578#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005579 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005580 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005581#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005582 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005583 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005584#endif
Tim Peters772747b2001-08-09 22:21:55 +00005585
Antoine Pitrou63065d72012-05-15 23:48:04 +02005586 /* Note: size will always be longer than the resulting Unicode
5587 character count */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005588 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005589 writer.min_length = (e - q + 1) / 2;
5590 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005591 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005592
Antoine Pitrou63065d72012-05-15 23:48:04 +02005593 while (1) {
5594 Py_UCS4 ch = 0;
5595 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005596 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005597 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005598 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005599 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005600 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005601 native_ordering);
5602 else
5603 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005604 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005605 native_ordering);
5606 } else if (kind == PyUnicode_2BYTE_KIND) {
5607 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005608 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005609 native_ordering);
5610 } else {
5611 assert(kind == PyUnicode_4BYTE_KIND);
5612 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005613 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005614 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005615 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005616 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005617
Antoine Pitrou63065d72012-05-15 23:48:04 +02005618 switch (ch)
5619 {
5620 case 0:
5621 /* remaining byte at the end? (size should be even) */
5622 if (q == e || consumed)
5623 goto End;
5624 errmsg = "truncated data";
5625 startinpos = ((const char *)q) - starts;
5626 endinpos = ((const char *)e) - starts;
5627 break;
5628 /* The remaining input chars are ignored if the callback
5629 chooses to skip the input */
5630 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005631 q -= 2;
5632 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005633 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005634 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005635 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005636 endinpos = ((const char *)e) - starts;
5637 break;
5638 case 2:
5639 errmsg = "illegal encoding";
5640 startinpos = ((const char *)q) - 2 - starts;
5641 endinpos = startinpos + 2;
5642 break;
5643 case 3:
5644 errmsg = "illegal UTF-16 surrogate";
5645 startinpos = ((const char *)q) - 4 - starts;
5646 endinpos = startinpos + 2;
5647 break;
5648 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005649 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005650 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005651 continue;
5652 }
5653
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005654 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005655 errors,
5656 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005657 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00005658 &starts,
5659 (const char **)&e,
5660 &startinpos,
5661 &endinpos,
5662 &exc,
5663 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005664 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005665 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005666 }
5667
Antoine Pitrou63065d72012-05-15 23:48:04 +02005668End:
Walter Dörwald69652032004-09-07 20:24:22 +00005669 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005670 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005671
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005672 Py_XDECREF(errorHandler);
5673 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005674 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005675
Benjamin Peterson29060642009-01-31 22:14:21 +00005676 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005677 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005678 Py_XDECREF(errorHandler);
5679 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005680 return NULL;
5681}
5682
Tim Peters772747b2001-08-09 22:21:55 +00005683PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005684_PyUnicode_EncodeUTF16(PyObject *str,
5685 const char *errors,
5686 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005687{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005688 enum PyUnicode_Kind kind;
5689 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005690 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005691 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005692 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005693 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005694#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005695 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005696#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005697 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005698#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005699 const char *encoding;
5700 Py_ssize_t nsize, pos;
5701 PyObject *errorHandler = NULL;
5702 PyObject *exc = NULL;
5703 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00005704
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005705 if (!PyUnicode_Check(str)) {
5706 PyErr_BadArgument();
5707 return NULL;
5708 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005709 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005710 return NULL;
5711 kind = PyUnicode_KIND(str);
5712 data = PyUnicode_DATA(str);
5713 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005714
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005715 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005716 if (kind == PyUnicode_4BYTE_KIND) {
5717 const Py_UCS4 *in = (const Py_UCS4 *)data;
5718 const Py_UCS4 *end = in + len;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005719 while (in < end) {
5720 if (*in++ >= 0x10000) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005721 pairs++;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005722 }
5723 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005724 }
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005725 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005726 return PyErr_NoMemory();
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005727 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005728 nsize = len + pairs + (byteorder == 0);
5729 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005730 if (v == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005731 return NULL;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005732 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005733
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005734 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005735 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005736 out = (unsigned short *)PyBytes_AS_STRING(v);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005737 if (byteorder == 0) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005738 *out++ = 0xFEFF;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005739 }
5740 if (len == 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00005741 goto done;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005742 }
Tim Peters772747b2001-08-09 22:21:55 +00005743
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005744 if (kind == PyUnicode_1BYTE_KIND) {
5745 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5746 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005747 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005748
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005749 if (byteorder < 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005750 encoding = "utf-16-le";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005751 }
5752 else if (byteorder > 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005753 encoding = "utf-16-be";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005754 }
5755 else {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005756 encoding = "utf-16";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005757 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005758
5759 pos = 0;
5760 while (pos < len) {
5761 Py_ssize_t repsize, moreunits;
5762
5763 if (kind == PyUnicode_2BYTE_KIND) {
5764 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
5765 &out, native_ordering);
5766 }
5767 else {
5768 assert(kind == PyUnicode_4BYTE_KIND);
5769 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
5770 &out, native_ordering);
5771 }
5772 if (pos == len)
5773 break;
5774
5775 rep = unicode_encode_call_errorhandler(
5776 errors, &errorHandler,
5777 encoding, "surrogates not allowed",
5778 str, &exc, pos, pos + 1, &pos);
5779 if (!rep)
5780 goto error;
5781
5782 if (PyBytes_Check(rep)) {
5783 repsize = PyBytes_GET_SIZE(rep);
5784 if (repsize & 1) {
5785 raise_encode_exception(&exc, encoding,
5786 str, pos - 1, pos,
5787 "surrogates not allowed");
5788 goto error;
5789 }
5790 moreunits = repsize / 2;
5791 }
5792 else {
5793 assert(PyUnicode_Check(rep));
5794 if (PyUnicode_READY(rep) < 0)
5795 goto error;
5796 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5797 if (!PyUnicode_IS_ASCII(rep)) {
5798 raise_encode_exception(&exc, encoding,
5799 str, pos - 1, pos,
5800 "surrogates not allowed");
5801 goto error;
5802 }
5803 }
5804
5805 /* two bytes are reserved for each surrogate */
5806 if (moreunits > 1) {
5807 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
5808 Py_ssize_t morebytes = 2 * (moreunits - 1);
5809 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5810 /* integer overflow */
5811 PyErr_NoMemory();
5812 goto error;
5813 }
5814 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5815 goto error;
5816 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
5817 }
5818
5819 if (PyBytes_Check(rep)) {
5820 Py_MEMCPY(out, PyBytes_AS_STRING(rep), repsize);
5821 out += moreunits;
5822 } else /* rep is unicode */ {
5823 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5824 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5825 &out, native_ordering);
5826 }
5827
5828 Py_CLEAR(rep);
5829 }
5830
5831 /* Cut back to size actually needed. This is necessary for, for example,
5832 encoding of a string containing isolated surrogates and the 'ignore' handler
5833 is used. */
5834 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5835 if (nsize != PyBytes_GET_SIZE(v))
5836 _PyBytes_Resize(&v, nsize);
5837 Py_XDECREF(errorHandler);
5838 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00005839 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005840 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005841 error:
5842 Py_XDECREF(rep);
5843 Py_XDECREF(errorHandler);
5844 Py_XDECREF(exc);
5845 Py_XDECREF(v);
5846 return NULL;
5847#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005848}
5849
Alexander Belopolsky40018472011-02-26 01:02:56 +00005850PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005851PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5852 Py_ssize_t size,
5853 const char *errors,
5854 int byteorder)
5855{
5856 PyObject *result;
5857 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5858 if (tmp == NULL)
5859 return NULL;
5860 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5861 Py_DECREF(tmp);
5862 return result;
5863}
5864
5865PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005866PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005867{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005868 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005869}
5870
5871/* --- Unicode Escape Codec ----------------------------------------------- */
5872
Fredrik Lundh06d12682001-01-24 07:59:11 +00005873static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005874
Alexander Belopolsky40018472011-02-26 01:02:56 +00005875PyObject *
5876PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005877 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005878 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005879{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005880 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005881 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005882 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005883 PyObject *errorHandler = NULL;
5884 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005885
Victor Stinner62ec3312016-09-06 17:04:34 -07005886 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005887 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07005888 }
5889 /* Escaped strings will always be longer than the resulting
5890 Unicode string, so we start with size here and then reduce the
5891 length after conversion to the true value.
5892 (but if the error callback returns a long replacement string
5893 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005894 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07005895 writer.min_length = size;
5896 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
5897 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005898 }
5899
Guido van Rossumd57fd912000-03-10 22:53:23 +00005900 end = s + size;
5901 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07005902 unsigned char c = (unsigned char) *s++;
5903 Py_UCS4 ch;
5904 int count;
5905 Py_ssize_t startinpos;
5906 Py_ssize_t endinpos;
5907 const char *message;
5908
5909#define WRITE_ASCII_CHAR(ch) \
5910 do { \
5911 assert(ch <= 127); \
5912 assert(writer.pos < writer.size); \
5913 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
5914 } while(0)
5915
5916#define WRITE_CHAR(ch) \
5917 do { \
5918 if (ch <= writer.maxchar) { \
5919 assert(writer.pos < writer.size); \
5920 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
5921 } \
5922 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
5923 goto onError; \
5924 } \
5925 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005926
5927 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07005928 if (c != '\\') {
5929 WRITE_CHAR(c);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005930 continue;
5931 }
5932
Victor Stinner62ec3312016-09-06 17:04:34 -07005933 startinpos = s - starts - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005934 /* \ - Escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07005935 if (s >= end) {
5936 message = "\\ at end of string";
5937 goto error;
5938 }
5939 c = (unsigned char) *s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005940
Victor Stinner62ec3312016-09-06 17:04:34 -07005941 assert(writer.pos < writer.size);
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005942 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005943
Benjamin Peterson29060642009-01-31 22:14:21 +00005944 /* \x escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07005945 case '\n': continue;
5946 case '\\': WRITE_ASCII_CHAR('\\'); continue;
5947 case '\'': WRITE_ASCII_CHAR('\''); continue;
5948 case '\"': WRITE_ASCII_CHAR('\"'); continue;
5949 case 'b': WRITE_ASCII_CHAR('\b'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005950 /* FF */
Victor Stinner62ec3312016-09-06 17:04:34 -07005951 case 'f': WRITE_ASCII_CHAR('\014'); continue;
5952 case 't': WRITE_ASCII_CHAR('\t'); continue;
5953 case 'n': WRITE_ASCII_CHAR('\n'); continue;
5954 case 'r': WRITE_ASCII_CHAR('\r'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005955 /* VT */
Victor Stinner62ec3312016-09-06 17:04:34 -07005956 case 'v': WRITE_ASCII_CHAR('\013'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005957 /* BEL, not classic C */
Victor Stinner62ec3312016-09-06 17:04:34 -07005958 case 'a': WRITE_ASCII_CHAR('\007'); continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005959
Benjamin Peterson29060642009-01-31 22:14:21 +00005960 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005961 case '0': case '1': case '2': case '3':
5962 case '4': case '5': case '6': case '7':
Victor Stinner62ec3312016-09-06 17:04:34 -07005963 ch = c - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005964 if (s < end && '0' <= *s && *s <= '7') {
Victor Stinner62ec3312016-09-06 17:04:34 -07005965 ch = (ch<<3) + *s++ - '0';
5966 if (s < end && '0' <= *s && *s <= '7') {
5967 ch = (ch<<3) + *s++ - '0';
5968 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005969 }
Victor Stinner62ec3312016-09-06 17:04:34 -07005970 WRITE_CHAR(ch);
5971 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005972
Benjamin Peterson29060642009-01-31 22:14:21 +00005973 /* hex escapes */
5974 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005975 case 'x':
Victor Stinner62ec3312016-09-06 17:04:34 -07005976 count = 2;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005977 message = "truncated \\xXX escape";
5978 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005979
Benjamin Peterson29060642009-01-31 22:14:21 +00005980 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005981 case 'u':
Victor Stinner62ec3312016-09-06 17:04:34 -07005982 count = 4;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005983 message = "truncated \\uXXXX escape";
5984 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005985
Benjamin Peterson29060642009-01-31 22:14:21 +00005986 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005987 case 'U':
Victor Stinner62ec3312016-09-06 17:04:34 -07005988 count = 8;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005989 message = "truncated \\UXXXXXXXX escape";
5990 hexescape:
Victor Stinner62ec3312016-09-06 17:04:34 -07005991 for (ch = 0; count && s < end; ++s, --count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02005992 c = (unsigned char)*s;
Victor Stinner62ec3312016-09-06 17:04:34 -07005993 ch <<= 4;
5994 if (c >= '0' && c <= '9') {
5995 ch += c - '0';
5996 }
5997 else if (c >= 'a' && c <= 'f') {
5998 ch += c - ('a' - 10);
5999 }
6000 else if (c >= 'A' && c <= 'F') {
6001 ch += c - ('A' - 10);
6002 }
6003 else {
6004 break;
6005 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00006006 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006007 if (count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006008 goto error;
Victor Stinner62ec3312016-09-06 17:04:34 -07006009 }
6010
6011 /* when we get here, ch is a 32-bit unicode character */
6012 if (ch > MAX_UNICODE) {
6013 message = "illegal Unicode character";
6014 goto error;
6015 }
6016
6017 WRITE_CHAR(ch);
6018 continue;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006019
Benjamin Peterson29060642009-01-31 22:14:21 +00006020 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006021 case 'N':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006022 if (ucnhash_CAPI == NULL) {
6023 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006024 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6025 PyUnicodeData_CAPSULE_NAME, 1);
Victor Stinner62ec3312016-09-06 17:04:34 -07006026 if (ucnhash_CAPI == NULL) {
6027 PyErr_SetString(
6028 PyExc_UnicodeError,
6029 "\\N escapes not supported (can't load unicodedata module)"
6030 );
6031 goto onError;
6032 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006033 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006034
6035 message = "malformed \\N character escape";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006036 if (*s == '{') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006037 const char *start = ++s;
6038 size_t namelen;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006039 /* look for the closing brace */
Victor Stinner62ec3312016-09-06 17:04:34 -07006040 while (s < end && *s != '}')
Fredrik Lundhccc74732001-02-18 22:13:49 +00006041 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006042 namelen = s - start;
6043 if (namelen && s < end) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00006044 /* found a name. look it up in the unicode database */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006045 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006046 ch = 0xffffffff; /* in case 'getcode' messes up */
6047 if (namelen <= INT_MAX &&
6048 ucnhash_CAPI->getcode(NULL, start, (int)namelen,
6049 &ch, 0)) {
6050 assert(ch <= MAX_UNICODE);
6051 WRITE_CHAR(ch);
6052 continue;
6053 }
6054 message = "unknown Unicode character name";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006055 }
6056 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006057 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006058
6059 default:
Victor Stinner62ec3312016-09-06 17:04:34 -07006060 WRITE_ASCII_CHAR('\\');
6061 WRITE_CHAR(c);
6062 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006063 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006064
6065 error:
6066 endinpos = s-starts;
Victor Stinner62ec3312016-09-06 17:04:34 -07006067 writer.min_length = end - s + writer.pos;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006068 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02006069 errors, &errorHandler,
6070 "unicodeescape", message,
6071 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner62ec3312016-09-06 17:04:34 -07006072 &writer)) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006073 goto onError;
Victor Stinner62ec3312016-09-06 17:04:34 -07006074 }
6075 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0) {
6076 goto onError;
6077 }
6078
6079#undef WRITE_ASCII_CHAR
6080#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006081 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006082
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006083 Py_XDECREF(errorHandler);
6084 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006085 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00006086
Benjamin Peterson29060642009-01-31 22:14:21 +00006087 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006088 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006089 Py_XDECREF(errorHandler);
6090 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006091 return NULL;
6092}
6093
/* Return a Unicode-Escape encoded version of the Unicode object, as a
   bytes object.

   The function takes no "quotes" argument: the result is never enclosed
   in quotes.

*/

6100
Alexander Belopolsky40018472011-02-26 01:02:56 +00006101PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006102PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006103{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006104 Py_ssize_t i, len;
Victor Stinner62ec3312016-09-06 17:04:34 -07006105 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006106 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006107 enum PyUnicode_Kind kind;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006108 void *data;
Victor Stinner62ec3312016-09-06 17:04:34 -07006109 Py_ssize_t expandsize;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006110
Ezio Melottie7f90372012-10-05 03:33:31 +03006111 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00006112 escape.
6113
Ezio Melottie7f90372012-10-05 03:33:31 +03006114 For UCS1 strings it's '\xxx', 4 bytes per source character.
6115 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6116 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00006117 */
6118
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006119 if (!PyUnicode_Check(unicode)) {
6120 PyErr_BadArgument();
6121 return NULL;
6122 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006123 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006124 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006125 }
Victor Stinner358af132015-10-12 22:36:57 +02006126
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006127 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006128 if (len == 0) {
6129 return PyBytes_FromStringAndSize(NULL, 0);
6130 }
6131
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006132 kind = PyUnicode_KIND(unicode);
6133 data = PyUnicode_DATA(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006134 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6135 bytes, and 1 byte characters 4. */
6136 expandsize = kind * 2 + 2;
6137 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize) {
6138 return PyErr_NoMemory();
6139 }
6140 repr = PyBytes_FromStringAndSize(NULL, 2 + expandsize * len + 1);
6141 if (repr == NULL) {
6142 return NULL;
6143 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006144
Victor Stinner62ec3312016-09-06 17:04:34 -07006145 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006146 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006147 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006148
Victor Stinner62ec3312016-09-06 17:04:34 -07006149 /* U+0000-U+00ff range */
6150 if (ch < 0x100) {
6151 if (ch >= ' ' && ch < 127) {
6152 if (ch != '\\') {
6153 /* Copy printable US ASCII as-is */
6154 *p++ = (char) ch;
6155 }
6156 /* Escape backslashes */
6157 else {
6158 *p++ = '\\';
6159 *p++ = '\\';
6160 }
6161 }
Victor Stinner358af132015-10-12 22:36:57 +02006162
Victor Stinner62ec3312016-09-06 17:04:34 -07006163 /* Map special whitespace to '\t', \n', '\r' */
6164 else if (ch == '\t') {
6165 *p++ = '\\';
6166 *p++ = 't';
6167 }
6168 else if (ch == '\n') {
6169 *p++ = '\\';
6170 *p++ = 'n';
6171 }
6172 else if (ch == '\r') {
6173 *p++ = '\\';
6174 *p++ = 'r';
6175 }
6176
6177 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6178 else {
6179 *p++ = '\\';
6180 *p++ = 'x';
6181 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6182 *p++ = Py_hexdigits[ch & 0x000F];
6183 }
Tim Petersced69f82003-09-16 20:30:58 +00006184 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006185 /* U+0000-U+00ff range: Map 16-bit characters to '\uHHHH' */
6186 else if (ch < 0x10000) {
6187 /* U+0100-U+ffff */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006188 *p++ = '\\';
6189 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006190 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6191 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6192 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6193 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006194 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006195 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6196 else {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006197
Victor Stinner62ec3312016-09-06 17:04:34 -07006198 /* Make sure that the first two digits are zero */
6199 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006200 *p++ = '\\';
Victor Stinner62ec3312016-09-06 17:04:34 -07006201 *p++ = 'U';
6202 *p++ = '0';
6203 *p++ = '0';
6204 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6205 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6206 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6207 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6208 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6209 *p++ = Py_hexdigits[ch & 0x0000000F];
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006210 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006211 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006212
Victor Stinner62ec3312016-09-06 17:04:34 -07006213 assert(p - PyBytes_AS_STRING(repr) > 0);
6214 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6215 return NULL;
6216 }
6217 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006218}
6219
Alexander Belopolsky40018472011-02-26 01:02:56 +00006220PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006221PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6222 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006223{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006224 PyObject *result;
6225 PyObject *tmp = PyUnicode_FromUnicode(s, size);
Victor Stinner62ec3312016-09-06 17:04:34 -07006226 if (tmp == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006227 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006228 }
6229
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006230 result = PyUnicode_AsUnicodeEscapeString(tmp);
6231 Py_DECREF(tmp);
6232 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006233}
6234
6235/* --- Raw Unicode Escape Codec ------------------------------------------- */
6236
Alexander Belopolsky40018472011-02-26 01:02:56 +00006237PyObject *
6238PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006239 Py_ssize_t size,
6240 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006241{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006242 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006243 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006244 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006245 PyObject *errorHandler = NULL;
6246 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006247
Victor Stinner62ec3312016-09-06 17:04:34 -07006248 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006249 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006250 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006251
Guido van Rossumd57fd912000-03-10 22:53:23 +00006252 /* Escaped strings will always be longer than the resulting
6253 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006254 length after conversion to the true value. (But decoding error
6255 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006256 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07006257 writer.min_length = size;
6258 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6259 goto onError;
6260 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006261
Guido van Rossumd57fd912000-03-10 22:53:23 +00006262 end = s + size;
6263 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006264 unsigned char c = (unsigned char) *s++;
6265 Py_UCS4 ch;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006266 int count;
Victor Stinner62ec3312016-09-06 17:04:34 -07006267 Py_ssize_t startinpos;
6268 Py_ssize_t endinpos;
6269 const char *message;
6270
6271#define WRITE_CHAR(ch) \
6272 do { \
6273 if (ch <= writer.maxchar) { \
6274 assert(writer.pos < writer.size); \
6275 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6276 } \
6277 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6278 goto onError; \
6279 } \
6280 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006281
Benjamin Peterson29060642009-01-31 22:14:21 +00006282 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006283 if (c != '\\' || s >= end) {
6284 WRITE_CHAR(c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006285 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006286 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006287
Victor Stinner62ec3312016-09-06 17:04:34 -07006288 c = (unsigned char) *s++;
6289 if (c == 'u') {
6290 count = 4;
6291 message = "truncated \\uXXXX escape";
Benjamin Peterson29060642009-01-31 22:14:21 +00006292 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006293 else if (c == 'U') {
6294 count = 8;
6295 message = "truncated \\UXXXXXXXX escape";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006296 }
6297 else {
Victor Stinner62ec3312016-09-06 17:04:34 -07006298 assert(writer.pos < writer.size);
6299 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6300 WRITE_CHAR(c);
6301 continue;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006302 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006303 startinpos = s - starts - 2;
6304
6305 /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6306 for (ch = 0; count && s < end; ++s, --count) {
6307 c = (unsigned char)*s;
6308 ch <<= 4;
6309 if (c >= '0' && c <= '9') {
6310 ch += c - '0';
6311 }
6312 else if (c >= 'a' && c <= 'f') {
6313 ch += c - ('a' - 10);
6314 }
6315 else if (c >= 'A' && c <= 'F') {
6316 ch += c - ('A' - 10);
6317 }
6318 else {
6319 break;
6320 }
6321 }
6322 if (!count) {
6323 if (ch <= MAX_UNICODE) {
6324 WRITE_CHAR(ch);
6325 continue;
6326 }
6327 message = "\\Uxxxxxxxx out of range";
6328 }
6329
6330 endinpos = s-starts;
6331 writer.min_length = end - s + writer.pos;
6332 if (unicode_decode_call_errorhandler_writer(
6333 errors, &errorHandler,
6334 "rawunicodeescape", message,
6335 &starts, &end, &startinpos, &endinpos, &exc, &s,
6336 &writer)) {
6337 goto onError;
6338 }
6339 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0) {
6340 goto onError;
6341 }
6342
6343#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006344 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006345 Py_XDECREF(errorHandler);
6346 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006347 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006348
Benjamin Peterson29060642009-01-31 22:14:21 +00006349 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006350 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006351 Py_XDECREF(errorHandler);
6352 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006353 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006354
Guido van Rossumd57fd912000-03-10 22:53:23 +00006355}
6356
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006357
Alexander Belopolsky40018472011-02-26 01:02:56 +00006358PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006359PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006360{
Victor Stinner62ec3312016-09-06 17:04:34 -07006361 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006362 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006363 Py_ssize_t expandsize, pos;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006364 int kind;
6365 void *data;
6366 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006367
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006368 if (!PyUnicode_Check(unicode)) {
6369 PyErr_BadArgument();
6370 return NULL;
6371 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006372 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006373 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006374 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006375 kind = PyUnicode_KIND(unicode);
6376 data = PyUnicode_DATA(unicode);
6377 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006378 if (kind == PyUnicode_1BYTE_KIND) {
6379 return PyBytes_FromStringAndSize(data, len);
6380 }
Victor Stinner0e368262011-11-10 20:12:49 +01006381
Victor Stinner62ec3312016-09-06 17:04:34 -07006382 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6383 bytes, and 1 byte characters 4. */
6384 expandsize = kind * 2 + 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006385
Victor Stinner62ec3312016-09-06 17:04:34 -07006386 if (len > PY_SSIZE_T_MAX / expandsize) {
6387 return PyErr_NoMemory();
6388 }
6389 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6390 if (repr == NULL) {
6391 return NULL;
6392 }
6393 if (len == 0) {
6394 return repr;
6395 }
6396
6397 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006398 for (pos = 0; pos < len; pos++) {
6399 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Victor Stinner358af132015-10-12 22:36:57 +02006400
Victor Stinner62ec3312016-09-06 17:04:34 -07006401 /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6402 if (ch < 0x100) {
6403 *p++ = (char) ch;
Tim Petersced69f82003-09-16 20:30:58 +00006404 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006405 /* U+0000-U+00ff range: Map 16-bit characters to '\uHHHH' */
6406 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006407 *p++ = '\\';
6408 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006409 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6410 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6411 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6412 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006413 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006414 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6415 else {
6416 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6417 *p++ = '\\';
6418 *p++ = 'U';
6419 *p++ = '0';
6420 *p++ = '0';
6421 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6422 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6423 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6424 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6425 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6426 *p++ = Py_hexdigits[ch & 15];
6427 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006428 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006429
Victor Stinner62ec3312016-09-06 17:04:34 -07006430 assert(p > PyBytes_AS_STRING(repr));
6431 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6432 return NULL;
6433 }
6434 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006435}
6436
Alexander Belopolsky40018472011-02-26 01:02:56 +00006437PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006438PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6439 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006440{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006441 PyObject *result;
6442 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6443 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006444 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006445 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6446 Py_DECREF(tmp);
6447 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006448}
6449
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006450/* --- Unicode Internal Codec ------------------------------------------- */
6451
/* Decode a buffer holding raw Py_UNICODE code units (the deprecated
   "unicode_internal" codec) into a str object.

   s      -- input bytes
   size   -- number of input bytes
   errors -- error handler name passed on to the decode error machinery

   Issues a DeprecationWarning first and returns NULL if that call reports
   failure.  Returns the decoded string, or NULL after releasing the
   writer and the cached error-handler state. */
PyObject *
_PyUnicode_DecodeUnicodeInternal(const char *s,
                                 Py_ssize_t size,
                                 const char *errors)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    _PyUnicodeWriter writer;
    const char *end;
    const char *reason;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    if (PyErr_WarnEx(PyExc_DeprecationWarning,
                     "unicode_internal codec has been deprecated",
                     1))
        return NULL;

    if (size == 0)
        _Py_RETURN_UNICODE_EMPTY();

    _PyUnicodeWriter_Init(&writer);
    if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) {
        PyErr_NoMemory();
        goto onError;
    }
    /* One output character per code unit, rounding a trailing partial
       unit up. */
    writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE;

    end = s + size;
    while (s < end) {
        Py_UNICODE uch;
        Py_UCS4 ch;
        /* Fewer bytes left than one code unit: report the whole tail. */
        if (end - s < Py_UNICODE_SIZE) {
            endinpos = end-starts;
            reason = "truncated input";
            goto error;
        }
        /* We copy the raw representation one byte at a time because the
           pointer may be unaligned (see test_codeccallbacks). */
        ((char *) &uch)[0] = s[0];
        ((char *) &uch)[1] = s[1];
#ifdef Py_UNICODE_WIDE
        ((char *) &uch)[2] = s[2];
        ((char *) &uch)[3] = s[3];
#endif
        ch = uch;
#ifdef Py_UNICODE_WIDE
        /* We have to sanity check the raw data, otherwise doom looms for
           some malformed UCS-4 data. */
        if (ch > 0x10ffff) {
            /* s has not been advanced yet, so the error span covers
               exactly this code unit. */
            endinpos = s - starts + Py_UNICODE_SIZE;
            reason = "illegal code point (> 0x10FFFF)";
            goto error;
        }
#endif
        s += Py_UNICODE_SIZE;
#ifndef Py_UNICODE_WIDE
        /* Narrow code units: a high surrogate followed by a low surrogate
           is combined into a single code point; a lone high surrogate is
           written unchanged. */
        if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
        {
            Py_UNICODE uch2;
            ((char *) &uch2)[0] = s[0];
            ((char *) &uch2)[1] = s[1];
            if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
            {
                ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
                s += Py_UNICODE_SIZE;
            }
        }
#endif

        if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
            goto onError;
        continue;

        /* Reached only through the gotos above; endinpos and reason have
           been set by the jumping code, and s still points at the start
           of the offending data. */
    error:
        startinpos = s - starts;
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                "unicode_internal", reason,
                &starts, &end, &startinpos, &endinpos, &exc, &s,
                &writer))
            goto onError;
    }

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    _PyUnicodeWriter_Dealloc(&writer);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return NULL;
}
6547
Guido van Rossumd57fd912000-03-10 22:53:23 +00006548/* --- Latin-1 Codec ------------------------------------------------------ */
6549
Alexander Belopolsky40018472011-02-26 01:02:56 +00006550PyObject *
6551PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006552 Py_ssize_t size,
6553 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006554{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006555 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006556 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006557}
6558
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006559/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006560static void
6561make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006562 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006563 PyObject *unicode,
6564 Py_ssize_t startpos, Py_ssize_t endpos,
6565 const char *reason)
6566{
6567 if (*exceptionObject == NULL) {
6568 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006569 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006570 encoding, unicode, startpos, endpos, reason);
6571 }
6572 else {
6573 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6574 goto onError;
6575 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6576 goto onError;
6577 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6578 goto onError;
6579 return;
6580 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006581 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006582 }
6583}
6584
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006585/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006586static void
6587raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006588 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006589 PyObject *unicode,
6590 Py_ssize_t startpos, Py_ssize_t endpos,
6591 const char *reason)
6592{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006593 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006594 encoding, unicode, startpos, endpos, reason);
6595 if (*exceptionObject != NULL)
6596 PyCodec_StrictErrors(*exceptionObject);
6597}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006598
6599/* error handling callback helper:
6600 build arguments, call the callback and check the arguments,
6601 put the result into newpos and return the replacement string, which
6602 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006603static PyObject *
6604unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006605 PyObject **errorHandler,
6606 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006607 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006608 Py_ssize_t startpos, Py_ssize_t endpos,
6609 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006610{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02006611 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006612 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006613 PyObject *restuple;
6614 PyObject *resunicode;
6615
6616 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006617 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006618 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006619 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006620 }
6621
Benjamin Petersonbac79492012-01-14 13:34:47 -05006622 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006623 return NULL;
6624 len = PyUnicode_GET_LENGTH(unicode);
6625
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006626 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006627 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006628 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006629 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006630
6631 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006632 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006633 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006634 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006635 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006636 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006637 Py_DECREF(restuple);
6638 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006639 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006640 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006641 &resunicode, newpos)) {
6642 Py_DECREF(restuple);
6643 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006644 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006645 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6646 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6647 Py_DECREF(restuple);
6648 return NULL;
6649 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006650 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006651 *newpos = len + *newpos;
6652 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02006653 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006654 Py_DECREF(restuple);
6655 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006656 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006657 Py_INCREF(resunicode);
6658 Py_DECREF(restuple);
6659 return resunicode;
6660}
6661
Alexander Belopolsky40018472011-02-26 01:02:56 +00006662static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006663unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006664 const char *errors,
Victor Stinner0030cd52015-09-24 14:45:00 +02006665 const Py_UCS4 limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006666{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006667 /* input state */
6668 Py_ssize_t pos=0, size;
6669 int kind;
6670 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006671 /* pointer into the output */
6672 char *str;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006673 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6674 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Victor Stinner50149202015-09-22 00:26:54 +02006675 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006676 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02006677 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006678 PyObject *rep = NULL;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006679 /* output object */
6680 _PyBytesWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006681
Benjamin Petersonbac79492012-01-14 13:34:47 -05006682 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006683 return NULL;
6684 size = PyUnicode_GET_LENGTH(unicode);
6685 kind = PyUnicode_KIND(unicode);
6686 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006687 /* allocate enough for a simple encoding without
6688 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006689 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006690 return PyBytes_FromStringAndSize(NULL, 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006691
6692 _PyBytesWriter_Init(&writer);
6693 str = _PyBytesWriter_Alloc(&writer, size);
6694 if (str == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006695 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006696
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006697 while (pos < size) {
Victor Stinner0030cd52015-09-24 14:45:00 +02006698 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006699
Benjamin Peterson29060642009-01-31 22:14:21 +00006700 /* can we encode this? */
Victor Stinner0030cd52015-09-24 14:45:00 +02006701 if (ch < limit) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006702 /* no overflow check, because we know that the space is enough */
Victor Stinner0030cd52015-09-24 14:45:00 +02006703 *str++ = (char)ch;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006704 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006705 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006706 else {
Victor Stinner6bd525b2015-10-09 13:10:05 +02006707 Py_ssize_t newpos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006708 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006709 Py_ssize_t collstart = pos;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006710 Py_ssize_t collend = collstart + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006711 /* find all unecodable characters */
Victor Stinner50149202015-09-22 00:26:54 +02006712
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006713 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006714 ++collend;
Victor Stinner50149202015-09-22 00:26:54 +02006715
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006716 /* Only overallocate the buffer if it's not the last write */
6717 writer.overallocate = (collend < size);
6718
Benjamin Peterson29060642009-01-31 22:14:21 +00006719 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02006720 if (error_handler == _Py_ERROR_UNKNOWN)
6721 error_handler = get_error_handler(errors);
6722
6723 switch (error_handler) {
6724 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006725 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006726 goto onError;
Victor Stinner50149202015-09-22 00:26:54 +02006727
6728 case _Py_ERROR_REPLACE:
Victor Stinner01ada392015-10-01 21:54:51 +02006729 memset(str, '?', collend - collstart);
6730 str += (collend - collstart);
Victor Stinner0030cd52015-09-24 14:45:00 +02006731 /* fall through ignore error handler */
Victor Stinner50149202015-09-22 00:26:54 +02006732 case _Py_ERROR_IGNORE:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006733 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006734 break;
Victor Stinner50149202015-09-22 00:26:54 +02006735
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006736 case _Py_ERROR_BACKSLASHREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006737 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006738 writer.min_size -= (collend - collstart);
6739 str = backslashreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006740 unicode, collstart, collend);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006741 if (str == NULL)
6742 goto onError;
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006743 pos = collend;
6744 break;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006745
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006746 case _Py_ERROR_XMLCHARREFREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006747 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006748 writer.min_size -= (collend - collstart);
6749 str = xmlcharrefreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006750 unicode, collstart, collend);
6751 if (str == NULL)
6752 goto onError;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006753 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006754 break;
Victor Stinner50149202015-09-22 00:26:54 +02006755
Victor Stinnerc3713e92015-09-29 12:32:13 +02006756 case _Py_ERROR_SURROGATEESCAPE:
6757 for (i = collstart; i < collend; ++i) {
6758 ch = PyUnicode_READ(kind, data, i);
6759 if (ch < 0xdc80 || 0xdcff < ch) {
6760 /* Not a UTF-8b surrogate */
6761 break;
6762 }
6763 *str++ = (char)(ch - 0xdc00);
6764 ++pos;
6765 }
6766 if (i >= collend)
6767 break;
6768 collstart = pos;
6769 assert(collstart != collend);
6770 /* fallback to general error handling */
6771
Benjamin Peterson29060642009-01-31 22:14:21 +00006772 default:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006773 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
6774 encoding, reason, unicode, &exc,
6775 collstart, collend, &newpos);
6776 if (rep == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006777 goto onError;
Victor Stinner0030cd52015-09-24 14:45:00 +02006778
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006779 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006780 writer.min_size -= 1;
6781
Victor Stinner6bd525b2015-10-09 13:10:05 +02006782 if (PyBytes_Check(rep)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006783 /* Directly copy bytes result to output. */
Victor Stinnerce179bf2015-10-09 12:57:22 +02006784 str = _PyBytesWriter_WriteBytes(&writer, str,
Victor Stinner6bd525b2015-10-09 13:10:05 +02006785 PyBytes_AS_STRING(rep),
6786 PyBytes_GET_SIZE(rep));
Victor Stinnerad771582015-10-09 12:38:53 +02006787 if (str == NULL)
6788 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006789 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02006790 else {
6791 assert(PyUnicode_Check(rep));
Victor Stinner0030cd52015-09-24 14:45:00 +02006792
Victor Stinner6bd525b2015-10-09 13:10:05 +02006793 if (PyUnicode_READY(rep) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006794 goto onError;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006795
6796 if (PyUnicode_IS_ASCII(rep)) {
6797 /* Fast path: all characters are smaller than limit */
6798 assert(limit >= 128);
6799 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6800 str = _PyBytesWriter_WriteBytes(&writer, str,
6801 PyUnicode_DATA(rep),
6802 PyUnicode_GET_LENGTH(rep));
Benjamin Peterson29060642009-01-31 22:14:21 +00006803 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02006804 else {
6805 Py_ssize_t repsize = PyUnicode_GET_LENGTH(rep);
6806
6807 str = _PyBytesWriter_Prepare(&writer, str, repsize);
6808 if (str == NULL)
6809 goto onError;
6810
6811 /* check if there is anything unencodable in the
6812 replacement and copy it to the output */
6813 for (i = 0; repsize-->0; ++i, ++str) {
6814 ch = PyUnicode_READ_CHAR(rep, i);
6815 if (ch >= limit) {
6816 raise_encode_exception(&exc, encoding, unicode,
6817 pos, pos+1, reason);
6818 goto onError;
6819 }
6820 *str = (char)ch;
6821 }
6822 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006823 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006824 pos = newpos;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006825 Py_CLEAR(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006826 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006827
6828 /* If overallocation was disabled, ensure that it was the last
6829 write. Otherwise, we missed an optimization */
6830 assert(writer.overallocate || pos == size);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006831 }
6832 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006833
Victor Stinner50149202015-09-22 00:26:54 +02006834 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006835 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006836 return _PyBytesWriter_Finish(&writer, str);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006837
6838 onError:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006839 Py_XDECREF(rep);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006840 _PyBytesWriter_Dealloc(&writer);
Victor Stinner50149202015-09-22 00:26:54 +02006841 Py_XDECREF(error_handler_obj);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006842 Py_XDECREF(exc);
6843 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006844}
6845
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006846/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006847PyObject *
6848PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006849 Py_ssize_t size,
6850 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006851{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006852 PyObject *result;
6853 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6854 if (unicode == NULL)
6855 return NULL;
6856 result = unicode_encode_ucs1(unicode, errors, 256);
6857 Py_DECREF(unicode);
6858 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006859}
6860
Alexander Belopolsky40018472011-02-26 01:02:56 +00006861PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006862_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006863{
6864 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006865 PyErr_BadArgument();
6866 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006867 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006868 if (PyUnicode_READY(unicode) == -1)
6869 return NULL;
6870 /* Fast path: if it is a one-byte string, construct
6871 bytes object directly. */
6872 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6873 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6874 PyUnicode_GET_LENGTH(unicode));
6875 /* Non-Latin-1 characters present. Defer to above function to
6876 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006877 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006878}
6879
6880PyObject*
6881PyUnicode_AsLatin1String(PyObject *unicode)
6882{
6883 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006884}
6885
6886/* --- 7-bit ASCII Codec -------------------------------------------------- */
6887
Alexander Belopolsky40018472011-02-26 01:02:56 +00006888PyObject *
6889PyUnicode_DecodeASCII(const char *s,
6890 Py_ssize_t size,
6891 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006892{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006893 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006894 _PyUnicodeWriter writer;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006895 int kind;
6896 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006897 Py_ssize_t startinpos;
6898 Py_ssize_t endinpos;
6899 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006900 const char *e;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006901 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006902 PyObject *exc = NULL;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006903 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Tim Petersced69f82003-09-16 20:30:58 +00006904
Guido van Rossumd57fd912000-03-10 22:53:23 +00006905 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02006906 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006907
Guido van Rossumd57fd912000-03-10 22:53:23 +00006908 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006909 if (size == 1 && (unsigned char)s[0] < 128)
6910 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006911
Victor Stinner8f674cc2013-04-17 23:02:17 +02006912 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02006913 writer.min_length = size;
6914 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
Victor Stinner8f674cc2013-04-17 23:02:17 +02006915 return NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006916
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006917 e = s + size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006918 data = writer.data;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006919 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006920 writer.pos = outpos;
6921 if (writer.pos == size)
6922 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006923
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006924 s += writer.pos;
6925 kind = writer.kind;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006926 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02006927 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00006928 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006929 PyUnicode_WRITE(kind, data, writer.pos, c);
6930 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00006931 ++s;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006932 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +00006933 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02006934
6935 /* byte outsize range 0x00..0x7f: call the error handler */
6936
6937 if (error_handler == _Py_ERROR_UNKNOWN)
6938 error_handler = get_error_handler(errors);
6939
6940 switch (error_handler)
6941 {
6942 case _Py_ERROR_REPLACE:
6943 case _Py_ERROR_SURROGATEESCAPE:
6944 /* Fast-path: the error handler only writes one character,
Victor Stinnerca9381e2015-09-22 00:58:32 +02006945 but we may switch to UCS2 at the first write */
6946 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
6947 goto onError;
6948 kind = writer.kind;
6949 data = writer.data;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006950
6951 if (error_handler == _Py_ERROR_REPLACE)
6952 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
6953 else
6954 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
6955 writer.pos++;
6956 ++s;
6957 break;
6958
6959 case _Py_ERROR_IGNORE:
6960 ++s;
6961 break;
6962
6963 default:
Benjamin Peterson29060642009-01-31 22:14:21 +00006964 startinpos = s-starts;
6965 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006966 if (unicode_decode_call_errorhandler_writer(
Victor Stinnerf96418d2015-09-21 23:06:27 +02006967 errors, &error_handler_obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00006968 "ascii", "ordinal not in range(128)",
6969 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006970 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006971 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006972 kind = writer.kind;
6973 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00006974 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006975 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02006976 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006977 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006978 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006979
Benjamin Peterson29060642009-01-31 22:14:21 +00006980 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006981 _PyUnicodeWriter_Dealloc(&writer);
Victor Stinnerf96418d2015-09-21 23:06:27 +02006982 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006983 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006984 return NULL;
6985}
6986
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006987/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006988PyObject *
6989PyUnicode_EncodeASCII(const Py_UNICODE *p,
6990 Py_ssize_t size,
6991 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006992{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006993 PyObject *result;
6994 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6995 if (unicode == NULL)
6996 return NULL;
6997 result = unicode_encode_ucs1(unicode, errors, 128);
6998 Py_DECREF(unicode);
6999 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007000}
7001
Alexander Belopolsky40018472011-02-26 01:02:56 +00007002PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007003_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007004{
7005 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007006 PyErr_BadArgument();
7007 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007008 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007009 if (PyUnicode_READY(unicode) == -1)
7010 return NULL;
7011 /* Fast path: if it is an ASCII-only string, construct bytes object
7012 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02007013 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007014 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7015 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007016 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007017}
7018
7019PyObject *
7020PyUnicode_AsASCIIString(PyObject *unicode)
7021{
7022 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007023}
7024
Victor Stinner99b95382011-07-04 14:23:54 +02007025#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007026
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007027/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007028
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00007029#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007030#define NEED_RETRY
7031#endif
7032
Victor Stinner3a50e702011-10-18 21:21:00 +02007033#ifndef WC_ERR_INVALID_CHARS
7034# define WC_ERR_INVALID_CHARS 0x0080
7035#endif
7036
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007037static const char*
Victor Stinner3a50e702011-10-18 21:21:00 +02007038code_page_name(UINT code_page, PyObject **obj)
7039{
7040 *obj = NULL;
7041 if (code_page == CP_ACP)
7042 return "mbcs";
7043 if (code_page == CP_UTF7)
7044 return "CP_UTF7";
7045 if (code_page == CP_UTF8)
7046 return "CP_UTF8";
7047
7048 *obj = PyBytes_FromFormat("cp%u", code_page);
7049 if (*obj == NULL)
7050 return NULL;
7051 return PyBytes_AS_STRING(*obj);
7052}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007053
Victor Stinner3a50e702011-10-18 21:21:00 +02007054static DWORD
7055decode_code_page_flags(UINT code_page)
7056{
7057 if (code_page == CP_UTF7) {
7058 /* The CP_UTF7 decoder only supports flags=0 */
7059 return 0;
7060 }
7061 else
7062 return MB_ERR_INVALID_CHARS;
7063}
7064
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007065/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007066 * Decode a byte string from a Windows code page into unicode object in strict
7067 * mode.
7068 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007069 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7070 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007071 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007072static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007073decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007074 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02007075 const char *in,
7076 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007077{
Victor Stinner3a50e702011-10-18 21:21:00 +02007078 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01007079 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007080 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007081
7082 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007083 assert(insize > 0);
7084 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
7085 if (outsize <= 0)
7086 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007087
7088 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007089 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01007090 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007091 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00007092 if (*v == NULL)
7093 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007094 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007095 }
7096 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007097 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007098 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01007099 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007100 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007101 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007102 }
7103
7104 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007105 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7106 if (outsize <= 0)
7107 goto error;
7108 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007109
Victor Stinner3a50e702011-10-18 21:21:00 +02007110error:
7111 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7112 return -2;
7113 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007114 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007115}
7116
Victor Stinner3a50e702011-10-18 21:21:00 +02007117/*
7118 * Decode a byte string from a code page into unicode object with an error
7119 * handler.
7120 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007121 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02007122 * UnicodeDecodeError exception and returns -1 on error.
7123 */
7124static int
7125decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007126 PyObject **v,
7127 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007128 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02007129{
7130 const char *startin = in;
7131 const char *endin = in + size;
7132 const DWORD flags = decode_code_page_flags(code_page);
7133 /* Ideally, we should get reason from FormatMessage. This is the Windows
7134 2000 English version of the message. */
7135 const char *reason = "No mapping for the Unicode character exists "
7136 "in the target code page.";
7137 /* each step cannot decode more than 1 character, but a character can be
7138 represented as a surrogate pair */
7139 wchar_t buffer[2], *startout, *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02007140 int insize;
7141 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007142 PyObject *errorHandler = NULL;
7143 PyObject *exc = NULL;
7144 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007145 const char *encoding;
Victor Stinner3a50e702011-10-18 21:21:00 +02007146 DWORD err;
7147 int ret = -1;
7148
7149 assert(size > 0);
7150
7151 encoding = code_page_name(code_page, &encoding_obj);
7152 if (encoding == NULL)
7153 return -1;
7154
Victor Stinner7d00cc12014-03-17 23:08:06 +01007155 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007156 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7157 UnicodeDecodeError. */
7158 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7159 if (exc != NULL) {
7160 PyCodec_StrictErrors(exc);
7161 Py_CLEAR(exc);
7162 }
7163 goto error;
7164 }
7165
7166 if (*v == NULL) {
7167 /* Create unicode object */
7168 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7169 PyErr_NoMemory();
7170 goto error;
7171 }
Victor Stinnerab595942011-12-17 04:59:06 +01007172 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007173 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02007174 if (*v == NULL)
7175 goto error;
7176 startout = PyUnicode_AS_UNICODE(*v);
7177 }
7178 else {
7179 /* Extend unicode object */
7180 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7181 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7182 PyErr_NoMemory();
7183 goto error;
7184 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007185 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007186 goto error;
7187 startout = PyUnicode_AS_UNICODE(*v) + n;
7188 }
7189
7190 /* Decode the byte string character per character */
7191 out = startout;
7192 while (in < endin)
7193 {
7194 /* Decode a character */
7195 insize = 1;
7196 do
7197 {
7198 outsize = MultiByteToWideChar(code_page, flags,
7199 in, insize,
7200 buffer, Py_ARRAY_LENGTH(buffer));
7201 if (outsize > 0)
7202 break;
7203 err = GetLastError();
7204 if (err != ERROR_NO_UNICODE_TRANSLATION
7205 && err != ERROR_INSUFFICIENT_BUFFER)
7206 {
7207 PyErr_SetFromWindowsErr(0);
7208 goto error;
7209 }
7210 insize++;
7211 }
7212 /* 4=maximum length of a UTF-8 sequence */
7213 while (insize <= 4 && (in + insize) <= endin);
7214
7215 if (outsize <= 0) {
7216 Py_ssize_t startinpos, endinpos, outpos;
7217
Victor Stinner7d00cc12014-03-17 23:08:06 +01007218 /* last character in partial decode? */
7219 if (in + insize >= endin && !final)
7220 break;
7221
Victor Stinner3a50e702011-10-18 21:21:00 +02007222 startinpos = in - startin;
7223 endinpos = startinpos + 1;
7224 outpos = out - PyUnicode_AS_UNICODE(*v);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007225 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007226 errors, &errorHandler,
7227 encoding, reason,
7228 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01007229 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007230 {
7231 goto error;
7232 }
Victor Stinner596a6c42011-11-09 00:02:18 +01007233 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007234 }
7235 else {
7236 in += insize;
7237 memcpy(out, buffer, outsize * sizeof(wchar_t));
7238 out += outsize;
7239 }
7240 }
7241
7242 /* write a NUL character at the end */
7243 *out = 0;
7244
7245 /* Extend unicode object */
7246 outsize = out - startout;
7247 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01007248 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007249 goto error;
Victor Stinnere1f17c62014-07-25 14:03:03 +02007250 /* (in - startin) <= size and size is an int */
7251 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02007252
7253error:
7254 Py_XDECREF(encoding_obj);
7255 Py_XDECREF(errorHandler);
7256 Py_XDECREF(exc);
7257 return ret;
7258}
7259
Victor Stinner3a50e702011-10-18 21:21:00 +02007260static PyObject *
7261decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007262 const char *s, Py_ssize_t size,
7263 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007264{
Victor Stinner76a31a62011-11-04 00:05:13 +01007265 PyObject *v = NULL;
7266 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007267
Victor Stinner3a50e702011-10-18 21:21:00 +02007268 if (code_page < 0) {
7269 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7270 return NULL;
7271 }
7272
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007273 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007274 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007275
Victor Stinner76a31a62011-11-04 00:05:13 +01007276 do
7277 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007278#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007279 if (size > INT_MAX) {
7280 chunk_size = INT_MAX;
7281 final = 0;
7282 done = 0;
7283 }
7284 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007285#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007286 {
7287 chunk_size = (int)size;
7288 final = (consumed == NULL);
7289 done = 1;
7290 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007291
Victor Stinner76a31a62011-11-04 00:05:13 +01007292 if (chunk_size == 0 && done) {
7293 if (v != NULL)
7294 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007295 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007296 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007297
Victor Stinner76a31a62011-11-04 00:05:13 +01007298 converted = decode_code_page_strict(code_page, &v,
7299 s, chunk_size);
7300 if (converted == -2)
7301 converted = decode_code_page_errors(code_page, &v,
7302 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007303 errors, final);
7304 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007305
7306 if (converted < 0) {
7307 Py_XDECREF(v);
7308 return NULL;
7309 }
7310
7311 if (consumed)
7312 *consumed += converted;
7313
7314 s += converted;
7315 size -= converted;
7316 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007317
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007318 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007319}
7320
Alexander Belopolsky40018472011-02-26 01:02:56 +00007321PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007322PyUnicode_DecodeCodePageStateful(int code_page,
7323 const char *s,
7324 Py_ssize_t size,
7325 const char *errors,
7326 Py_ssize_t *consumed)
7327{
7328 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7329}
7330
7331PyObject *
7332PyUnicode_DecodeMBCSStateful(const char *s,
7333 Py_ssize_t size,
7334 const char *errors,
7335 Py_ssize_t *consumed)
7336{
7337 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7338}
7339
7340PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007341PyUnicode_DecodeMBCS(const char *s,
7342 Py_ssize_t size,
7343 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007344{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007345 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7346}
7347
Victor Stinner3a50e702011-10-18 21:21:00 +02007348static DWORD
7349encode_code_page_flags(UINT code_page, const char *errors)
7350{
7351 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007352 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007353 }
7354 else if (code_page == CP_UTF7) {
7355 /* CP_UTF7 only supports flags=0 */
7356 return 0;
7357 }
7358 else {
7359 if (errors != NULL && strcmp(errors, "replace") == 0)
7360 return 0;
7361 else
7362 return WC_NO_BEST_FIT_CHARS;
7363 }
7364}
7365
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007366/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007367 * Encode a Unicode string to a Windows code page into a byte string in strict
7368 * mode.
7369 *
7370 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007371 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007372 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007373static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007374encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007375 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007376 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007377{
Victor Stinner554f3f02010-06-16 23:33:54 +00007378 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007379 BOOL *pusedDefaultChar = &usedDefaultChar;
7380 int outsize;
Victor Stinner24729f32011-11-10 20:31:37 +01007381 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007382 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007383 const DWORD flags = encode_code_page_flags(code_page, NULL);
7384 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007385 /* Create a substring so that we can get the UTF-16 representation
7386 of just the slice under consideration. */
7387 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007388
Martin v. Löwis3d325192011-11-04 18:23:06 +01007389 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007390
Victor Stinner3a50e702011-10-18 21:21:00 +02007391 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007392 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007393 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007394 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007395
Victor Stinner2fc507f2011-11-04 20:06:39 +01007396 substring = PyUnicode_Substring(unicode, offset, offset+len);
7397 if (substring == NULL)
7398 return -1;
7399 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7400 if (p == NULL) {
7401 Py_DECREF(substring);
7402 return -1;
7403 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007404 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007405
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007406 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007407 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007408 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007409 NULL, 0,
7410 NULL, pusedDefaultChar);
7411 if (outsize <= 0)
7412 goto error;
7413 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007414 if (pusedDefaultChar && *pusedDefaultChar) {
7415 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007416 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007417 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007418
Victor Stinner3a50e702011-10-18 21:21:00 +02007419 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007420 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007421 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007422 if (*outbytes == NULL) {
7423 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007424 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007425 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007426 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007427 }
7428 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007429 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007430 const Py_ssize_t n = PyBytes_Size(*outbytes);
7431 if (outsize > PY_SSIZE_T_MAX - n) {
7432 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007433 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007434 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007435 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007436 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7437 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007438 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007439 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007440 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007441 }
7442
7443 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007444 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007445 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007446 out, outsize,
7447 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007448 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007449 if (outsize <= 0)
7450 goto error;
7451 if (pusedDefaultChar && *pusedDefaultChar)
7452 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007453 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007454
Victor Stinner3a50e702011-10-18 21:21:00 +02007455error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007456 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007457 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7458 return -2;
7459 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007460 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007461}
7462
Victor Stinner3a50e702011-10-18 21:21:00 +02007463/*
Serhiy Storchakad65c9492015-11-02 14:10:23 +02007464 * Encode a Unicode string to a Windows code page into a byte string using an
Victor Stinner3a50e702011-10-18 21:21:00 +02007465 * error handler.
7466 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007467 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007468 * -1 on other error.
7469 */
7470static int
7471encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007472 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007473 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007474{
Victor Stinner3a50e702011-10-18 21:21:00 +02007475 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007476 Py_ssize_t pos = unicode_offset;
7477 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007478 /* Ideally, we should get reason from FormatMessage. This is the Windows
7479 2000 English version of the message. */
7480 const char *reason = "invalid character";
7481 /* 4=maximum length of a UTF-8 sequence */
7482 char buffer[4];
7483 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7484 Py_ssize_t outsize;
7485 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007486 PyObject *errorHandler = NULL;
7487 PyObject *exc = NULL;
7488 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007489 const char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007490 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007491 PyObject *rep;
7492 int ret = -1;
7493
7494 assert(insize > 0);
7495
7496 encoding = code_page_name(code_page, &encoding_obj);
7497 if (encoding == NULL)
7498 return -1;
7499
7500 if (errors == NULL || strcmp(errors, "strict") == 0) {
7501 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7502 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007503 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007504 if (exc != NULL) {
7505 PyCodec_StrictErrors(exc);
7506 Py_DECREF(exc);
7507 }
7508 Py_XDECREF(encoding_obj);
7509 return -1;
7510 }
7511
7512 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7513 pusedDefaultChar = &usedDefaultChar;
7514 else
7515 pusedDefaultChar = NULL;
7516
7517 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7518 PyErr_NoMemory();
7519 goto error;
7520 }
7521 outsize = insize * Py_ARRAY_LENGTH(buffer);
7522
7523 if (*outbytes == NULL) {
7524 /* Create string object */
7525 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7526 if (*outbytes == NULL)
7527 goto error;
7528 out = PyBytes_AS_STRING(*outbytes);
7529 }
7530 else {
7531 /* Extend string object */
7532 Py_ssize_t n = PyBytes_Size(*outbytes);
7533 if (n > PY_SSIZE_T_MAX - outsize) {
7534 PyErr_NoMemory();
7535 goto error;
7536 }
7537 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7538 goto error;
7539 out = PyBytes_AS_STRING(*outbytes) + n;
7540 }
7541
7542 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007543 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007544 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007545 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7546 wchar_t chars[2];
7547 int charsize;
7548 if (ch < 0x10000) {
7549 chars[0] = (wchar_t)ch;
7550 charsize = 1;
7551 }
7552 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007553 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7554 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007555 charsize = 2;
7556 }
7557
Victor Stinner3a50e702011-10-18 21:21:00 +02007558 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007559 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007560 buffer, Py_ARRAY_LENGTH(buffer),
7561 NULL, pusedDefaultChar);
7562 if (outsize > 0) {
7563 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7564 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007565 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007566 memcpy(out, buffer, outsize);
7567 out += outsize;
7568 continue;
7569 }
7570 }
7571 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7572 PyErr_SetFromWindowsErr(0);
7573 goto error;
7574 }
7575
Victor Stinner3a50e702011-10-18 21:21:00 +02007576 rep = unicode_encode_call_errorhandler(
7577 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007578 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007579 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007580 if (rep == NULL)
7581 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007582 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007583
7584 if (PyBytes_Check(rep)) {
7585 outsize = PyBytes_GET_SIZE(rep);
7586 if (outsize != 1) {
7587 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7588 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7589 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7590 Py_DECREF(rep);
7591 goto error;
7592 }
7593 out = PyBytes_AS_STRING(*outbytes) + offset;
7594 }
7595 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7596 out += outsize;
7597 }
7598 else {
7599 Py_ssize_t i;
7600 enum PyUnicode_Kind kind;
7601 void *data;
7602
Benjamin Petersonbac79492012-01-14 13:34:47 -05007603 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007604 Py_DECREF(rep);
7605 goto error;
7606 }
7607
7608 outsize = PyUnicode_GET_LENGTH(rep);
7609 if (outsize != 1) {
7610 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7611 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7612 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7613 Py_DECREF(rep);
7614 goto error;
7615 }
7616 out = PyBytes_AS_STRING(*outbytes) + offset;
7617 }
7618 kind = PyUnicode_KIND(rep);
7619 data = PyUnicode_DATA(rep);
7620 for (i=0; i < outsize; i++) {
7621 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7622 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007623 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007624 encoding, unicode,
7625 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007626 "unable to encode error handler result to ASCII");
7627 Py_DECREF(rep);
7628 goto error;
7629 }
7630 *out = (unsigned char)ch;
7631 out++;
7632 }
7633 }
7634 Py_DECREF(rep);
7635 }
7636 /* write a NUL byte */
7637 *out = 0;
7638 outsize = out - PyBytes_AS_STRING(*outbytes);
7639 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7640 if (_PyBytes_Resize(outbytes, outsize) < 0)
7641 goto error;
7642 ret = 0;
7643
7644error:
7645 Py_XDECREF(encoding_obj);
7646 Py_XDECREF(errorHandler);
7647 Py_XDECREF(exc);
7648 return ret;
7649}
7650
Victor Stinner3a50e702011-10-18 21:21:00 +02007651static PyObject *
7652encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007653 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007654 const char *errors)
7655{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007656 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007657 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007658 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007659 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007660
Victor Stinner29dacf22015-01-26 16:41:32 +01007661 if (!PyUnicode_Check(unicode)) {
7662 PyErr_BadArgument();
7663 return NULL;
7664 }
7665
Benjamin Petersonbac79492012-01-14 13:34:47 -05007666 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007667 return NULL;
7668 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007669
Victor Stinner3a50e702011-10-18 21:21:00 +02007670 if (code_page < 0) {
7671 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7672 return NULL;
7673 }
7674
Martin v. Löwis3d325192011-11-04 18:23:06 +01007675 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007676 return PyBytes_FromStringAndSize(NULL, 0);
7677
Victor Stinner7581cef2011-11-03 22:32:33 +01007678 offset = 0;
7679 do
7680 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007681#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007682 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007683 chunks. */
7684 if (len > INT_MAX/2) {
7685 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007686 done = 0;
7687 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007688 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007689#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007690 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007691 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007692 done = 1;
7693 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007694
Victor Stinner76a31a62011-11-04 00:05:13 +01007695 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007696 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007697 errors);
7698 if (ret == -2)
7699 ret = encode_code_page_errors(code_page, &outbytes,
7700 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007701 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007702 if (ret < 0) {
7703 Py_XDECREF(outbytes);
7704 return NULL;
7705 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007706
Victor Stinner7581cef2011-11-03 22:32:33 +01007707 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007708 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007709 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007710
Victor Stinner3a50e702011-10-18 21:21:00 +02007711 return outbytes;
7712}
7713
7714PyObject *
7715PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7716 Py_ssize_t size,
7717 const char *errors)
7718{
Victor Stinner7581cef2011-11-03 22:32:33 +01007719 PyObject *unicode, *res;
7720 unicode = PyUnicode_FromUnicode(p, size);
7721 if (unicode == NULL)
7722 return NULL;
7723 res = encode_code_page(CP_ACP, unicode, errors);
7724 Py_DECREF(unicode);
7725 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007726}
7727
7728PyObject *
7729PyUnicode_EncodeCodePage(int code_page,
7730 PyObject *unicode,
7731 const char *errors)
7732{
Victor Stinner7581cef2011-11-03 22:32:33 +01007733 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007734}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007735
Alexander Belopolsky40018472011-02-26 01:02:56 +00007736PyObject *
7737PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007738{
Victor Stinner7581cef2011-11-03 22:32:33 +01007739 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007740}
7741
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007742#undef NEED_RETRY
7743
Victor Stinner99b95382011-07-04 14:23:54 +02007744#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007745
Guido van Rossumd57fd912000-03-10 22:53:23 +00007746/* --- Character Mapping Codec -------------------------------------------- */
7747
Victor Stinnerfb161b12013-04-18 01:44:27 +02007748static int
7749charmap_decode_string(const char *s,
7750 Py_ssize_t size,
7751 PyObject *mapping,
7752 const char *errors,
7753 _PyUnicodeWriter *writer)
7754{
7755 const char *starts = s;
7756 const char *e;
7757 Py_ssize_t startinpos, endinpos;
7758 PyObject *errorHandler = NULL, *exc = NULL;
7759 Py_ssize_t maplen;
7760 enum PyUnicode_Kind mapkind;
7761 void *mapdata;
7762 Py_UCS4 x;
7763 unsigned char ch;
7764
7765 if (PyUnicode_READY(mapping) == -1)
7766 return -1;
7767
7768 maplen = PyUnicode_GET_LENGTH(mapping);
7769 mapdata = PyUnicode_DATA(mapping);
7770 mapkind = PyUnicode_KIND(mapping);
7771
7772 e = s + size;
7773
7774 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7775 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7776 * is disabled in encoding aliases, latin1 is preferred because
7777 * its implementation is faster. */
7778 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7779 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7780 Py_UCS4 maxchar = writer->maxchar;
7781
7782 assert (writer->kind == PyUnicode_1BYTE_KIND);
7783 while (s < e) {
7784 ch = *s;
7785 x = mapdata_ucs1[ch];
7786 if (x > maxchar) {
7787 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7788 goto onError;
7789 maxchar = writer->maxchar;
7790 outdata = (Py_UCS1 *)writer->data;
7791 }
7792 outdata[writer->pos] = x;
7793 writer->pos++;
7794 ++s;
7795 }
7796 return 0;
7797 }
7798
7799 while (s < e) {
7800 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7801 enum PyUnicode_Kind outkind = writer->kind;
7802 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7803 if (outkind == PyUnicode_1BYTE_KIND) {
7804 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7805 Py_UCS4 maxchar = writer->maxchar;
7806 while (s < e) {
7807 ch = *s;
7808 x = mapdata_ucs2[ch];
7809 if (x > maxchar)
7810 goto Error;
7811 outdata[writer->pos] = x;
7812 writer->pos++;
7813 ++s;
7814 }
7815 break;
7816 }
7817 else if (outkind == PyUnicode_2BYTE_KIND) {
7818 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7819 while (s < e) {
7820 ch = *s;
7821 x = mapdata_ucs2[ch];
7822 if (x == 0xFFFE)
7823 goto Error;
7824 outdata[writer->pos] = x;
7825 writer->pos++;
7826 ++s;
7827 }
7828 break;
7829 }
7830 }
7831 ch = *s;
7832
7833 if (ch < maplen)
7834 x = PyUnicode_READ(mapkind, mapdata, ch);
7835 else
7836 x = 0xfffe; /* invalid value */
7837Error:
7838 if (x == 0xfffe)
7839 {
7840 /* undefined mapping */
7841 startinpos = s-starts;
7842 endinpos = startinpos+1;
7843 if (unicode_decode_call_errorhandler_writer(
7844 errors, &errorHandler,
7845 "charmap", "character maps to <undefined>",
7846 &starts, &e, &startinpos, &endinpos, &exc, &s,
7847 writer)) {
7848 goto onError;
7849 }
7850 continue;
7851 }
7852
7853 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
7854 goto onError;
7855 ++s;
7856 }
7857 Py_XDECREF(errorHandler);
7858 Py_XDECREF(exc);
7859 return 0;
7860
7861onError:
7862 Py_XDECREF(errorHandler);
7863 Py_XDECREF(exc);
7864 return -1;
7865}
7866
7867static int
7868charmap_decode_mapping(const char *s,
7869 Py_ssize_t size,
7870 PyObject *mapping,
7871 const char *errors,
7872 _PyUnicodeWriter *writer)
7873{
7874 const char *starts = s;
7875 const char *e;
7876 Py_ssize_t startinpos, endinpos;
7877 PyObject *errorHandler = NULL, *exc = NULL;
7878 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02007879 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02007880
7881 e = s + size;
7882
7883 while (s < e) {
7884 ch = *s;
7885
7886 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7887 key = PyLong_FromLong((long)ch);
7888 if (key == NULL)
7889 goto onError;
7890
7891 item = PyObject_GetItem(mapping, key);
7892 Py_DECREF(key);
7893 if (item == NULL) {
7894 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7895 /* No mapping found means: mapping is undefined. */
7896 PyErr_Clear();
7897 goto Undefined;
7898 } else
7899 goto onError;
7900 }
7901
7902 /* Apply mapping */
7903 if (item == Py_None)
7904 goto Undefined;
7905 if (PyLong_Check(item)) {
7906 long value = PyLong_AS_LONG(item);
7907 if (value == 0xFFFE)
7908 goto Undefined;
7909 if (value < 0 || value > MAX_UNICODE) {
7910 PyErr_Format(PyExc_TypeError,
7911 "character mapping must be in range(0x%lx)",
7912 (unsigned long)MAX_UNICODE + 1);
7913 goto onError;
7914 }
7915
7916 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7917 goto onError;
7918 }
7919 else if (PyUnicode_Check(item)) {
7920 if (PyUnicode_READY(item) == -1)
7921 goto onError;
7922 if (PyUnicode_GET_LENGTH(item) == 1) {
7923 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
7924 if (value == 0xFFFE)
7925 goto Undefined;
7926 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7927 goto onError;
7928 }
7929 else {
7930 writer->overallocate = 1;
7931 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
7932 goto onError;
7933 }
7934 }
7935 else {
7936 /* wrong return value */
7937 PyErr_SetString(PyExc_TypeError,
7938 "character mapping must return integer, None or str");
7939 goto onError;
7940 }
7941 Py_CLEAR(item);
7942 ++s;
7943 continue;
7944
7945Undefined:
7946 /* undefined mapping */
7947 Py_CLEAR(item);
7948 startinpos = s-starts;
7949 endinpos = startinpos+1;
7950 if (unicode_decode_call_errorhandler_writer(
7951 errors, &errorHandler,
7952 "charmap", "character maps to <undefined>",
7953 &starts, &e, &startinpos, &endinpos, &exc, &s,
7954 writer)) {
7955 goto onError;
7956 }
7957 }
7958 Py_XDECREF(errorHandler);
7959 Py_XDECREF(exc);
7960 return 0;
7961
7962onError:
7963 Py_XDECREF(item);
7964 Py_XDECREF(errorHandler);
7965 Py_XDECREF(exc);
7966 return -1;
7967}
7968
Alexander Belopolsky40018472011-02-26 01:02:56 +00007969PyObject *
7970PyUnicode_DecodeCharmap(const char *s,
7971 Py_ssize_t size,
7972 PyObject *mapping,
7973 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007974{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007975 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00007976
Guido van Rossumd57fd912000-03-10 22:53:23 +00007977 /* Default to Latin-1 */
7978 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007979 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007980
Guido van Rossumd57fd912000-03-10 22:53:23 +00007981 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02007982 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02007983 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02007984 writer.min_length = size;
7985 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007986 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007987
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007988 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02007989 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
7990 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007991 }
7992 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02007993 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
7994 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007995 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007996 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007997
Benjamin Peterson29060642009-01-31 22:14:21 +00007998 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007999 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008000 return NULL;
8001}
8002
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008003/* Charmap encoding: the lookup table */
8004
Alexander Belopolsky40018472011-02-26 01:02:56 +00008005struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00008006 PyObject_HEAD
8007 unsigned char level1[32];
8008 int count2, count3;
8009 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008010};
8011
8012static PyObject*
8013encoding_map_size(PyObject *obj, PyObject* args)
8014{
8015 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008016 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00008017 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008018}
8019
8020static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008021 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00008022 PyDoc_STR("Return the size (in bytes) of this object") },
8023 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008024};
8025
8026static void
8027encoding_map_dealloc(PyObject* o)
8028{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008029 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008030}
8031
8032static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008033 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008034 "EncodingMap", /*tp_name*/
8035 sizeof(struct encoding_map), /*tp_basicsize*/
8036 0, /*tp_itemsize*/
8037 /* methods */
8038 encoding_map_dealloc, /*tp_dealloc*/
8039 0, /*tp_print*/
8040 0, /*tp_getattr*/
8041 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00008042 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008043 0, /*tp_repr*/
8044 0, /*tp_as_number*/
8045 0, /*tp_as_sequence*/
8046 0, /*tp_as_mapping*/
8047 0, /*tp_hash*/
8048 0, /*tp_call*/
8049 0, /*tp_str*/
8050 0, /*tp_getattro*/
8051 0, /*tp_setattro*/
8052 0, /*tp_as_buffer*/
8053 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8054 0, /*tp_doc*/
8055 0, /*tp_traverse*/
8056 0, /*tp_clear*/
8057 0, /*tp_richcompare*/
8058 0, /*tp_weaklistoffset*/
8059 0, /*tp_iter*/
8060 0, /*tp_iternext*/
8061 encoding_map_methods, /*tp_methods*/
8062 0, /*tp_members*/
8063 0, /*tp_getset*/
8064 0, /*tp_base*/
8065 0, /*tp_dict*/
8066 0, /*tp_descr_get*/
8067 0, /*tp_descr_set*/
8068 0, /*tp_dictoffset*/
8069 0, /*tp_init*/
8070 0, /*tp_alloc*/
8071 0, /*tp_new*/
8072 0, /*tp_free*/
8073 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008074};
8075
8076PyObject*
8077PyUnicode_BuildEncodingMap(PyObject* string)
8078{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008079 PyObject *result;
8080 struct encoding_map *mresult;
8081 int i;
8082 int need_dict = 0;
8083 unsigned char level1[32];
8084 unsigned char level2[512];
8085 unsigned char *mlevel1, *mlevel2, *mlevel3;
8086 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008087 int kind;
8088 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008089 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008090 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008091
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008092 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008093 PyErr_BadArgument();
8094 return NULL;
8095 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008096 kind = PyUnicode_KIND(string);
8097 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008098 length = PyUnicode_GET_LENGTH(string);
8099 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008100 memset(level1, 0xFF, sizeof level1);
8101 memset(level2, 0xFF, sizeof level2);
8102
8103 /* If there isn't a one-to-one mapping of NULL to \0,
8104 or if there are non-BMP characters, we need to use
8105 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008106 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008107 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008108 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008109 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008110 ch = PyUnicode_READ(kind, data, i);
8111 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008112 need_dict = 1;
8113 break;
8114 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008115 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008116 /* unmapped character */
8117 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008118 l1 = ch >> 11;
8119 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008120 if (level1[l1] == 0xFF)
8121 level1[l1] = count2++;
8122 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008123 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008124 }
8125
8126 if (count2 >= 0xFF || count3 >= 0xFF)
8127 need_dict = 1;
8128
8129 if (need_dict) {
8130 PyObject *result = PyDict_New();
8131 PyObject *key, *value;
8132 if (!result)
8133 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008134 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008135 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008136 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008137 if (!key || !value)
8138 goto failed1;
8139 if (PyDict_SetItem(result, key, value) == -1)
8140 goto failed1;
8141 Py_DECREF(key);
8142 Py_DECREF(value);
8143 }
8144 return result;
8145 failed1:
8146 Py_XDECREF(key);
8147 Py_XDECREF(value);
8148 Py_DECREF(result);
8149 return NULL;
8150 }
8151
8152 /* Create a three-level trie */
8153 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8154 16*count2 + 128*count3 - 1);
8155 if (!result)
8156 return PyErr_NoMemory();
8157 PyObject_Init(result, &EncodingMapType);
8158 mresult = (struct encoding_map*)result;
8159 mresult->count2 = count2;
8160 mresult->count3 = count3;
8161 mlevel1 = mresult->level1;
8162 mlevel2 = mresult->level23;
8163 mlevel3 = mresult->level23 + 16*count2;
8164 memcpy(mlevel1, level1, 32);
8165 memset(mlevel2, 0xFF, 16*count2);
8166 memset(mlevel3, 0, 128*count3);
8167 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008168 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008169 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008170 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8171 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008172 /* unmapped character */
8173 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008174 o1 = ch>>11;
8175 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008176 i2 = 16*mlevel1[o1] + o2;
8177 if (mlevel2[i2] == 0xFF)
8178 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008179 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008180 i3 = 128*mlevel2[i2] + o3;
8181 mlevel3[i3] = i;
8182 }
8183 return result;
8184}
8185
8186static int
Victor Stinner22168992011-11-20 17:09:18 +01008187encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008188{
8189 struct encoding_map *map = (struct encoding_map*)mapping;
8190 int l1 = c>>11;
8191 int l2 = (c>>7) & 0xF;
8192 int l3 = c & 0x7F;
8193 int i;
8194
Victor Stinner22168992011-11-20 17:09:18 +01008195 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008196 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008197 if (c == 0)
8198 return 0;
8199 /* level 1*/
8200 i = map->level1[l1];
8201 if (i == 0xFF) {
8202 return -1;
8203 }
8204 /* level 2*/
8205 i = map->level23[16*i+l2];
8206 if (i == 0xFF) {
8207 return -1;
8208 }
8209 /* level 3 */
8210 i = map->level23[16*map->count2 + 128*i + l3];
8211 if (i == 0) {
8212 return -1;
8213 }
8214 return i;
8215}
8216
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008217/* Lookup the character ch in the mapping. If the character
8218 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008219 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008220static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008221charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008222{
Christian Heimes217cfd12007-12-02 14:31:20 +00008223 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008224 PyObject *x;
8225
8226 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008227 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008228 x = PyObject_GetItem(mapping, w);
8229 Py_DECREF(w);
8230 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008231 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8232 /* No mapping found means: mapping is undefined. */
8233 PyErr_Clear();
8234 x = Py_None;
8235 Py_INCREF(x);
8236 return x;
8237 } else
8238 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008239 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008240 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008241 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008242 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008243 long value = PyLong_AS_LONG(x);
8244 if (value < 0 || value > 255) {
8245 PyErr_SetString(PyExc_TypeError,
8246 "character mapping must be in range(256)");
8247 Py_DECREF(x);
8248 return NULL;
8249 }
8250 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008251 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008252 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008253 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008254 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008255 /* wrong return value */
8256 PyErr_Format(PyExc_TypeError,
8257 "character mapping must return integer, bytes or None, not %.400s",
8258 x->ob_type->tp_name);
8259 Py_DECREF(x);
8260 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008261 }
8262}
8263
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008264static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008265charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008266{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008267 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8268 /* exponentially overallocate to minimize reallocations */
8269 if (requiredsize < 2*outsize)
8270 requiredsize = 2*outsize;
8271 if (_PyBytes_Resize(outobj, requiredsize))
8272 return -1;
8273 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008274}
8275
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the encoded byte(s) were appended to the output */
    enc_FAILED,     /* the character has no mapping; nothing was written */
    enc_EXCEPTION   /* the lookup or the output resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008279/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008280 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008281 space is available. Return a new reference to the object that
8282 was put in the output buffer, or Py_None, if the mapping was undefined
8283 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008284 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008285static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008286charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008287 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008288{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008289 PyObject *rep;
8290 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008291 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008292
Christian Heimes90aa7642007-12-19 02:45:37 +00008293 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008294 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008295 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008296 if (res == -1)
8297 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008298 if (outsize<requiredsize)
8299 if (charmapencode_resize(outobj, outpos, requiredsize))
8300 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008301 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008302 outstart[(*outpos)++] = (char)res;
8303 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008304 }
8305
8306 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008307 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008308 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008309 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008310 Py_DECREF(rep);
8311 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008312 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008313 if (PyLong_Check(rep)) {
8314 Py_ssize_t requiredsize = *outpos+1;
8315 if (outsize<requiredsize)
8316 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8317 Py_DECREF(rep);
8318 return enc_EXCEPTION;
8319 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008320 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008321 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008322 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008323 else {
8324 const char *repchars = PyBytes_AS_STRING(rep);
8325 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8326 Py_ssize_t requiredsize = *outpos+repsize;
8327 if (outsize<requiredsize)
8328 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8329 Py_DECREF(rep);
8330 return enc_EXCEPTION;
8331 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008332 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008333 memcpy(outstart + *outpos, repchars, repsize);
8334 *outpos += repsize;
8335 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008336 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008337 Py_DECREF(rep);
8338 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008339}
8340
8341/* handle an error in PyUnicode_EncodeCharmap
8342 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008343static int
8344charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008345 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008346 PyObject **exceptionObject,
Victor Stinner50149202015-09-22 00:26:54 +02008347 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008348 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008349{
8350 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008351 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008352 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008353 enum PyUnicode_Kind kind;
8354 void *data;
8355 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008356 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008357 Py_ssize_t collstartpos = *inpos;
8358 Py_ssize_t collendpos = *inpos+1;
8359 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008360 char *encoding = "charmap";
8361 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008362 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008363 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008364 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008365
Benjamin Petersonbac79492012-01-14 13:34:47 -05008366 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008367 return -1;
8368 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008369 /* find all unencodable characters */
8370 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008371 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008372 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008373 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008374 val = encoding_map_lookup(ch, mapping);
8375 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008376 break;
8377 ++collendpos;
8378 continue;
8379 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008380
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008381 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8382 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008383 if (rep==NULL)
8384 return -1;
8385 else if (rep!=Py_None) {
8386 Py_DECREF(rep);
8387 break;
8388 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008389 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008390 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008391 }
8392 /* cache callback name lookup
8393 * (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02008394 if (*error_handler == _Py_ERROR_UNKNOWN)
8395 *error_handler = get_error_handler(errors);
8396
8397 switch (*error_handler) {
8398 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008399 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008400 return -1;
Victor Stinner50149202015-09-22 00:26:54 +02008401
8402 case _Py_ERROR_REPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008403 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008404 x = charmapencode_output('?', mapping, res, respos);
8405 if (x==enc_EXCEPTION) {
8406 return -1;
8407 }
8408 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008409 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008410 return -1;
8411 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008412 }
8413 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02008414 case _Py_ERROR_IGNORE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008415 *inpos = collendpos;
8416 break;
Victor Stinner50149202015-09-22 00:26:54 +02008417
8418 case _Py_ERROR_XMLCHARREFREPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008419 /* generate replacement (temporarily (mis)uses p) */
8420 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008421 char buffer[2+29+1+1];
8422 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008423 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008424 for (cp = buffer; *cp; ++cp) {
8425 x = charmapencode_output(*cp, mapping, res, respos);
8426 if (x==enc_EXCEPTION)
8427 return -1;
8428 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008429 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008430 return -1;
8431 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008432 }
8433 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008434 *inpos = collendpos;
8435 break;
Victor Stinner50149202015-09-22 00:26:54 +02008436
Benjamin Peterson14339b62009-01-31 16:36:08 +00008437 default:
Victor Stinner50149202015-09-22 00:26:54 +02008438 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008439 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008440 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008441 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008442 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008443 if (PyBytes_Check(repunicode)) {
8444 /* Directly copy bytes result to output. */
8445 Py_ssize_t outsize = PyBytes_Size(*res);
8446 Py_ssize_t requiredsize;
8447 repsize = PyBytes_Size(repunicode);
8448 requiredsize = *respos + repsize;
8449 if (requiredsize > outsize)
8450 /* Make room for all additional bytes. */
8451 if (charmapencode_resize(res, respos, requiredsize)) {
8452 Py_DECREF(repunicode);
8453 return -1;
8454 }
8455 memcpy(PyBytes_AsString(*res) + *respos,
8456 PyBytes_AsString(repunicode), repsize);
8457 *respos += repsize;
8458 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008459 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008460 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008461 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008462 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008463 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008464 Py_DECREF(repunicode);
8465 return -1;
8466 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008467 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008468 data = PyUnicode_DATA(repunicode);
8469 kind = PyUnicode_KIND(repunicode);
8470 for (index = 0; index < repsize; index++) {
8471 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8472 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008473 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008474 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008475 return -1;
8476 }
8477 else if (x==enc_FAILED) {
8478 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008479 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008480 return -1;
8481 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008482 }
8483 *inpos = newpos;
8484 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008485 }
8486 return 0;
8487}
8488
Alexander Belopolsky40018472011-02-26 01:02:56 +00008489PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008490_PyUnicode_EncodeCharmap(PyObject *unicode,
8491 PyObject *mapping,
8492 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008493{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008494 /* output object */
8495 PyObject *res = NULL;
8496 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008497 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008498 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008499 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008500 Py_ssize_t respos = 0;
Victor Stinner50149202015-09-22 00:26:54 +02008501 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008502 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02008503 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008504 void *data;
8505 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008506
Benjamin Petersonbac79492012-01-14 13:34:47 -05008507 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008508 return NULL;
8509 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008510 data = PyUnicode_DATA(unicode);
8511 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008512
Guido van Rossumd57fd912000-03-10 22:53:23 +00008513 /* Default to Latin-1 */
8514 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008515 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008516
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008517 /* allocate enough for a simple encoding without
8518 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008519 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008520 if (res == NULL)
8521 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008522 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008523 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008524
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008525 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008526 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008527 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008528 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008529 if (x==enc_EXCEPTION) /* error */
8530 goto onError;
8531 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008532 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008533 &exc,
Victor Stinner50149202015-09-22 00:26:54 +02008534 &error_handler, &error_handler_obj, errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00008535 &res, &respos)) {
8536 goto onError;
8537 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008538 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008539 else
8540 /* done with this character => adjust input position */
8541 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008542 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008543
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008544 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008545 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008546 if (_PyBytes_Resize(&res, respos) < 0)
8547 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008548
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008549 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008550 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008551 return res;
8552
Benjamin Peterson29060642009-01-31 22:14:21 +00008553 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008554 Py_XDECREF(res);
8555 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008556 Py_XDECREF(error_handler_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008557 return NULL;
8558}
8559
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008560/* Deprecated */
8561PyObject *
8562PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8563 Py_ssize_t size,
8564 PyObject *mapping,
8565 const char *errors)
8566{
8567 PyObject *result;
8568 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8569 if (unicode == NULL)
8570 return NULL;
8571 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8572 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008573 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008574}
8575
Alexander Belopolsky40018472011-02-26 01:02:56 +00008576PyObject *
8577PyUnicode_AsCharmapString(PyObject *unicode,
8578 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008579{
8580 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008581 PyErr_BadArgument();
8582 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008583 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008584 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008585}
8586
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008587/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008588static void
8589make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008590 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008591 Py_ssize_t startpos, Py_ssize_t endpos,
8592 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008593{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008594 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008595 *exceptionObject = _PyUnicodeTranslateError_Create(
8596 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008597 }
8598 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008599 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8600 goto onError;
8601 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8602 goto onError;
8603 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8604 goto onError;
8605 return;
8606 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008607 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008608 }
8609}
8610
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008611/* error handling callback helper:
8612 build arguments, call the callback and check the arguments,
8613 put the result into newpos and return the replacement string, which
8614 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008615static PyObject *
8616unicode_translate_call_errorhandler(const char *errors,
8617 PyObject **errorHandler,
8618 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008619 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008620 Py_ssize_t startpos, Py_ssize_t endpos,
8621 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008622{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02008623 static const char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008624
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008625 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008626 PyObject *restuple;
8627 PyObject *resunicode;
8628
8629 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008630 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008631 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008632 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008633 }
8634
8635 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008636 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008637 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008638 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008639
8640 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008641 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008642 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008643 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008644 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008645 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008646 Py_DECREF(restuple);
8647 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008648 }
8649 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008650 &resunicode, &i_newpos)) {
8651 Py_DECREF(restuple);
8652 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008653 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008654 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008655 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008656 else
8657 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008658 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02008659 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008660 Py_DECREF(restuple);
8661 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008662 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008663 Py_INCREF(resunicode);
8664 Py_DECREF(restuple);
8665 return resunicode;
8666}
8667
8668/* Lookup the character ch in the mapping and put the result in result,
8669 which must be decrefed by the caller.
8670 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008671static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008672charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008673{
Christian Heimes217cfd12007-12-02 14:31:20 +00008674 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008675 PyObject *x;
8676
8677 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008678 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008679 x = PyObject_GetItem(mapping, w);
8680 Py_DECREF(w);
8681 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008682 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8683 /* No mapping found means: use 1:1 mapping. */
8684 PyErr_Clear();
8685 *result = NULL;
8686 return 0;
8687 } else
8688 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008689 }
8690 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008691 *result = x;
8692 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008693 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008694 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008695 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008696 if (value < 0 || value > MAX_UNICODE) {
8697 PyErr_Format(PyExc_ValueError,
8698 "character mapping must be in range(0x%x)",
8699 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008700 Py_DECREF(x);
8701 return -1;
8702 }
8703 *result = x;
8704 return 0;
8705 }
8706 else if (PyUnicode_Check(x)) {
8707 *result = x;
8708 return 0;
8709 }
8710 else {
8711 /* wrong return value */
8712 PyErr_SetString(PyExc_TypeError,
8713 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008714 Py_DECREF(x);
8715 return -1;
8716 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008717}
Victor Stinner1194ea02014-04-04 19:37:40 +02008718
8719/* lookup the character, write the result into the writer.
8720 Return 1 if the result was written into the writer, return 0 if the mapping
8721 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008722static int
Victor Stinner1194ea02014-04-04 19:37:40 +02008723charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8724 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008725{
Victor Stinner1194ea02014-04-04 19:37:40 +02008726 PyObject *item;
8727
8728 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00008729 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008730
8731 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008732 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02008733 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008734 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008735 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008736 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008737 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008738
8739 if (item == Py_None) {
8740 Py_DECREF(item);
8741 return 0;
8742 }
8743
8744 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02008745 long ch = (Py_UCS4)PyLong_AS_LONG(item);
8746 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8747 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008748 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8749 Py_DECREF(item);
8750 return -1;
8751 }
8752 Py_DECREF(item);
8753 return 1;
8754 }
8755
8756 if (!PyUnicode_Check(item)) {
8757 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008758 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008759 }
8760
8761 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8762 Py_DECREF(item);
8763 return -1;
8764 }
8765
8766 Py_DECREF(item);
8767 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008768}
8769
Victor Stinner89a76ab2014-04-05 11:44:04 +02008770static int
8771unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8772 Py_UCS1 *translate)
8773{
Benjamin Peterson1365de72014-04-07 20:15:41 -04008774 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008775 int ret = 0;
8776
Victor Stinner89a76ab2014-04-05 11:44:04 +02008777 if (charmaptranslate_lookup(ch, mapping, &item)) {
8778 return -1;
8779 }
8780
8781 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008782 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02008783 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008784 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008785 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008786 /* not found => default to 1:1 mapping */
8787 translate[ch] = ch;
8788 return 1;
8789 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008790 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02008791 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008792 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8793 used it */
8794 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008795 /* invalid character or character outside ASCII:
8796 skip the fast translate */
8797 goto exit;
8798 }
8799 translate[ch] = (Py_UCS1)replace;
8800 }
8801 else if (PyUnicode_Check(item)) {
8802 Py_UCS4 replace;
8803
8804 if (PyUnicode_READY(item) == -1) {
8805 Py_DECREF(item);
8806 return -1;
8807 }
8808 if (PyUnicode_GET_LENGTH(item) != 1)
8809 goto exit;
8810
8811 replace = PyUnicode_READ_CHAR(item, 0);
8812 if (replace > 127)
8813 goto exit;
8814 translate[ch] = (Py_UCS1)replace;
8815 }
8816 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008817 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02008818 goto exit;
8819 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008820 ret = 1;
8821
Benjamin Peterson1365de72014-04-07 20:15:41 -04008822 exit:
8823 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008824 return ret;
8825}
8826
8827/* Fast path for ascii => ascii translation. Return 1 if the whole string
8828 was translated into writer, return 0 if the input string was partially
8829 translated into writer, raise an exception and return -1 on error. */
8830static int
8831unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008832 _PyUnicodeWriter *writer, int ignore,
8833 Py_ssize_t *input_pos)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008834{
Victor Stinner872b2912014-04-05 14:27:07 +02008835 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008836 Py_ssize_t len;
8837 Py_UCS1 *in, *end, *out;
Victor Stinner872b2912014-04-05 14:27:07 +02008838 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008839
Victor Stinner89a76ab2014-04-05 11:44:04 +02008840 len = PyUnicode_GET_LENGTH(input);
8841
Victor Stinner872b2912014-04-05 14:27:07 +02008842 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008843
8844 in = PyUnicode_1BYTE_DATA(input);
8845 end = in + len;
8846
8847 assert(PyUnicode_IS_ASCII(writer->buffer));
8848 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
8849 out = PyUnicode_1BYTE_DATA(writer->buffer);
8850
Victor Stinner872b2912014-04-05 14:27:07 +02008851 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008852 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02008853 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008854 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02008855 int translate = unicode_fast_translate_lookup(mapping, ch,
8856 ascii_table);
8857 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008858 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02008859 if (translate == 0)
8860 goto exit;
8861 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008862 }
Victor Stinner872b2912014-04-05 14:27:07 +02008863 if (ch2 == 0xfe) {
8864 if (ignore)
8865 continue;
8866 goto exit;
8867 }
8868 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008869 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02008870 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008871 }
Victor Stinner872b2912014-04-05 14:27:07 +02008872 res = 1;
8873
8874exit:
8875 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008876 *input_pos = in - PyUnicode_1BYTE_DATA(input);
Victor Stinner872b2912014-04-05 14:27:07 +02008877 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008878}
8879
Victor Stinner3222da22015-10-01 22:07:32 +02008880static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008881_PyUnicode_TranslateCharmap(PyObject *input,
8882 PyObject *mapping,
8883 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008884{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008885 /* input object */
Victor Stinner1194ea02014-04-04 19:37:40 +02008886 char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008887 Py_ssize_t size, i;
8888 int kind;
8889 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02008890 _PyUnicodeWriter writer;
8891 /* error handler */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008892 char *reason = "character maps to <undefined>";
8893 PyObject *errorHandler = NULL;
8894 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008895 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008896 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008897
Guido van Rossumd57fd912000-03-10 22:53:23 +00008898 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008899 PyErr_BadArgument();
8900 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008901 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008902
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008903 if (PyUnicode_READY(input) == -1)
8904 return NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008905 data = (char*)PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008906 kind = PyUnicode_KIND(input);
8907 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008908
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03008909 if (size == 0)
8910 return PyUnicode_FromObject(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008911
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008912 /* allocate enough for a simple 1:1 translation without
8913 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02008914 _PyUnicodeWriter_Init(&writer);
8915 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008916 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008917
Victor Stinner872b2912014-04-05 14:27:07 +02008918 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
8919
Victor Stinner33798672016-03-01 21:59:58 +01008920 if (PyUnicode_READY(input) == -1)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008921 return NULL;
Victor Stinner33798672016-03-01 21:59:58 +01008922 if (PyUnicode_IS_ASCII(input)) {
8923 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
8924 if (res < 0) {
8925 _PyUnicodeWriter_Dealloc(&writer);
8926 return NULL;
8927 }
8928 if (res == 1)
8929 return _PyUnicodeWriter_Finish(&writer);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008930 }
Victor Stinner33798672016-03-01 21:59:58 +01008931 else {
8932 i = 0;
8933 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008934
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008935 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008936 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008937 int translate;
8938 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8939 Py_ssize_t newpos;
8940 /* startpos for collecting untranslatable chars */
8941 Py_ssize_t collstart;
8942 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02008943 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008944
Victor Stinner1194ea02014-04-04 19:37:40 +02008945 ch = PyUnicode_READ(kind, data, i);
8946 translate = charmaptranslate_output(ch, mapping, &writer);
8947 if (translate < 0)
8948 goto onError;
8949
8950 if (translate != 0) {
8951 /* it worked => adjust input pointer */
8952 ++i;
8953 continue;
8954 }
8955
8956 /* untranslatable character */
8957 collstart = i;
8958 collend = i+1;
8959
8960 /* find all untranslatable characters */
8961 while (collend < size) {
8962 PyObject *x;
8963 ch = PyUnicode_READ(kind, data, collend);
8964 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008965 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02008966 Py_XDECREF(x);
8967 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008968 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02008969 ++collend;
8970 }
8971
8972 if (ignore) {
8973 i = collend;
8974 }
8975 else {
8976 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
8977 reason, input, &exc,
8978 collstart, collend, &newpos);
8979 if (repunicode == NULL)
8980 goto onError;
8981 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008982 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02008983 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008984 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008985 Py_DECREF(repunicode);
8986 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008987 }
8988 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008989 Py_XDECREF(exc);
8990 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02008991 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008992
Benjamin Peterson29060642009-01-31 22:14:21 +00008993 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02008994 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008995 Py_XDECREF(exc);
8996 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008997 return NULL;
8998}
8999
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009000/* Deprecated. Use PyUnicode_Translate instead. */
9001PyObject *
9002PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9003 Py_ssize_t size,
9004 PyObject *mapping,
9005 const char *errors)
9006{
Christian Heimes5f520f42012-09-11 14:03:25 +02009007 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009008 PyObject *unicode = PyUnicode_FromUnicode(p, size);
9009 if (!unicode)
9010 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02009011 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9012 Py_DECREF(unicode);
9013 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009014}
9015
Alexander Belopolsky40018472011-02-26 01:02:56 +00009016PyObject *
9017PyUnicode_Translate(PyObject *str,
9018 PyObject *mapping,
9019 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009020{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009021 if (ensure_unicode(str) < 0)
Christian Heimes5f520f42012-09-11 14:03:25 +02009022 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009023 return _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009024}
Tim Petersced69f82003-09-16 20:30:58 +00009025
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009026static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009027fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009028{
9029 /* No need to call PyUnicode_READY(self) because this function is only
9030 called as a callback from fixup() which does it already. */
9031 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9032 const int kind = PyUnicode_KIND(self);
9033 void *data = PyUnicode_DATA(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02009034 Py_UCS4 maxchar = 127, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009035 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009036 Py_ssize_t i;
9037
9038 for (i = 0; i < len; ++i) {
9039 ch = PyUnicode_READ(kind, data, i);
9040 fixed = 0;
9041 if (ch > 127) {
9042 if (Py_UNICODE_ISSPACE(ch))
9043 fixed = ' ';
9044 else {
9045 const int decimal = Py_UNICODE_TODECIMAL(ch);
9046 if (decimal >= 0)
9047 fixed = '0' + decimal;
9048 }
9049 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009050 modified = 1;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009051 maxchar = Py_MAX(maxchar, fixed);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009052 PyUnicode_WRITE(kind, data, i, fixed);
9053 }
Victor Stinnere6abb482012-05-02 01:15:40 +02009054 else
Benjamin Peterson7e303732013-06-10 09:19:46 -07009055 maxchar = Py_MAX(maxchar, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009056 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009057 }
9058
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009059 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009060}
9061
9062PyObject *
9063_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9064{
9065 if (!PyUnicode_Check(unicode)) {
9066 PyErr_BadInternalCall();
9067 return NULL;
9068 }
9069 if (PyUnicode_READY(unicode) == -1)
9070 return NULL;
9071 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
9072 /* If the string is already ASCII, just return the same string */
9073 Py_INCREF(unicode);
9074 return unicode;
9075 }
Victor Stinner9310abb2011-10-05 00:59:23 +02009076 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009077}
9078
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009079PyObject *
9080PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9081 Py_ssize_t length)
9082{
Victor Stinnerf0124502011-11-21 23:12:56 +01009083 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009084 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009085 Py_UCS4 maxchar;
9086 enum PyUnicode_Kind kind;
9087 void *data;
9088
Victor Stinner99d7ad02012-02-22 13:37:39 +01009089 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009090 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009091 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009092 if (ch > 127) {
9093 int decimal = Py_UNICODE_TODECIMAL(ch);
9094 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009095 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009096 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009097 }
9098 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009099
9100 /* Copy to a new string */
9101 decimal = PyUnicode_New(length, maxchar);
9102 if (decimal == NULL)
9103 return decimal;
9104 kind = PyUnicode_KIND(decimal);
9105 data = PyUnicode_DATA(decimal);
9106 /* Iterate over code points */
9107 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009108 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01009109 if (ch > 127) {
9110 int decimal = Py_UNICODE_TODECIMAL(ch);
9111 if (decimal >= 0)
9112 ch = '0' + decimal;
9113 }
9114 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009115 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009116 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009117}
/* --- Decimal Encoder ---------------------------------------------------- */
9119
Alexander Belopolsky40018472011-02-26 01:02:56 +00009120int
9121PyUnicode_EncodeDecimal(Py_UNICODE *s,
9122 Py_ssize_t length,
9123 char *output,
9124 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009125{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009126 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009127 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009128 enum PyUnicode_Kind kind;
9129 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009130
9131 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009132 PyErr_BadArgument();
9133 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009134 }
9135
Victor Stinner42bf7752011-11-21 22:52:58 +01009136 unicode = PyUnicode_FromUnicode(s, length);
9137 if (unicode == NULL)
9138 return -1;
9139
Benjamin Petersonbac79492012-01-14 13:34:47 -05009140 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner6345be92011-11-25 20:09:01 +01009141 Py_DECREF(unicode);
9142 return -1;
9143 }
Victor Stinner42bf7752011-11-21 22:52:58 +01009144 kind = PyUnicode_KIND(unicode);
9145 data = PyUnicode_DATA(unicode);
9146
Victor Stinnerb84d7232011-11-22 01:50:07 +01009147 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009148 PyObject *exc;
9149 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009150 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009151 Py_ssize_t startpos;
9152
9153 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009154
Benjamin Peterson29060642009-01-31 22:14:21 +00009155 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009156 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009157 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009158 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009159 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009160 decimal = Py_UNICODE_TODECIMAL(ch);
9161 if (decimal >= 0) {
9162 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009163 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009164 continue;
9165 }
9166 if (0 < ch && ch < 256) {
9167 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009168 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009169 continue;
9170 }
Victor Stinner6345be92011-11-25 20:09:01 +01009171
Victor Stinner42bf7752011-11-21 22:52:58 +01009172 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009173 exc = NULL;
9174 raise_encode_exception(&exc, "decimal", unicode,
9175 startpos, startpos+1,
9176 "invalid decimal Unicode string");
9177 Py_XDECREF(exc);
9178 Py_DECREF(unicode);
9179 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009180 }
9181 /* 0-terminate the output string */
9182 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009183 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009184 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009185}
9186
/* --- Helpers ------------------------------------------------------------ */
9188
/* Helper macro to fix up start/end slice values in place.

   `start` and `end` must be modifiable signed lvalues; `len` is the length
   of the sequence being sliced and may be evaluated more than once.

     - end > len  : clamped to len
     - end < 0    : len is added, then clamped to 0
     - start < 0  : len is added, then clamped to 0

   A positive `start` is left untouched (it may still exceed `end`).

   The body is wrapped in do { } while (0) so that the macro expands to a
   single statement and is safe inside an unbraced if/else; invoke it with
   a trailing semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
9203
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009204static Py_ssize_t
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009205any_find_slice(PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009206 Py_ssize_t start,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009207 Py_ssize_t end,
9208 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009209{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009210 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009211 void *buf1, *buf2;
9212 Py_ssize_t len1, len2, result;
9213
9214 kind1 = PyUnicode_KIND(s1);
9215 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009216 if (kind1 < kind2)
9217 return -1;
9218
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009219 len1 = PyUnicode_GET_LENGTH(s1);
9220 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009221 ADJUST_INDICES(start, end, len1);
9222 if (end - start < len2)
9223 return -1;
9224
9225 buf1 = PyUnicode_DATA(s1);
9226 buf2 = PyUnicode_DATA(s2);
9227 if (len2 == 1) {
9228 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9229 result = findchar((const char *)buf1 + kind1*start,
9230 kind1, end - start, ch, direction);
9231 if (result == -1)
9232 return -1;
9233 else
9234 return start + result;
9235 }
9236
9237 if (kind2 != kind1) {
9238 buf2 = _PyUnicode_AsKind(s2, kind1);
9239 if (!buf2)
9240 return -2;
9241 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009242
Victor Stinner794d5672011-10-10 03:21:36 +02009243 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009244 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009245 case PyUnicode_1BYTE_KIND:
9246 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9247 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9248 else
9249 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9250 break;
9251 case PyUnicode_2BYTE_KIND:
9252 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9253 break;
9254 case PyUnicode_4BYTE_KIND:
9255 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9256 break;
9257 default:
9258 assert(0); result = -2;
9259 }
9260 }
9261 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009262 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009263 case PyUnicode_1BYTE_KIND:
9264 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9265 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9266 else
9267 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9268 break;
9269 case PyUnicode_2BYTE_KIND:
9270 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9271 break;
9272 case PyUnicode_4BYTE_KIND:
9273 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9274 break;
9275 default:
9276 assert(0); result = -2;
9277 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009278 }
9279
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009280 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009281 PyMem_Free(buf2);
9282
9283 return result;
9284}
9285
/* Dispatch to the character-width specific *_InsertThousandsGrouping
   helper for `unicode`.

   unicode        target string, or NULL.  With NULL, no output buffer is
                  passed on (data == NULL), the 1-byte helper is used and
                  *maxchar is filled in before returning.
   index          character offset into `unicode` where the buffer starts.
   n_buffer, digits, n_digits, min_width, grouping
                  forwarded unchanged to the helper.
   thousands_sep  str object holding the separator.
   maxchar        written only when `unicode` is NULL: 127, or the larger
                  of 127 and thousands_sep's maximum character when the
                  helper's result differs from n_digits.

   Returns the helper's result, or -1 when a temporary converted buffer
   cannot be allocated (or the kind is not one of the three known ones). */
Py_ssize_t
_PyUnicode_InsertThousandsGrouping(
    PyObject *unicode, Py_ssize_t index,
    Py_ssize_t n_buffer,
    void *digits, Py_ssize_t n_digits,
    Py_ssize_t min_width,
    const char *grouping, PyObject *thousands_sep,
    Py_UCS4 *maxchar)
{
    unsigned int kind, thousands_sep_kind;
    char *data, *thousands_sep_data;
    Py_ssize_t thousands_sep_len;
    Py_ssize_t len;

    if (unicode != NULL) {
        kind = PyUnicode_KIND(unicode);
        /* kind doubles as the per-character byte size here */
        data = (char *) PyUnicode_DATA(unicode) + index * kind;
    }
    else {
        kind = PyUnicode_1BYTE_KIND;
        data = NULL;
    }
    thousands_sep_kind = PyUnicode_KIND(thousands_sep);
    thousands_sep_data = PyUnicode_DATA(thousands_sep);
    thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
    if (unicode != NULL && thousands_sep_kind != kind) {
        if (thousands_sep_kind < kind) {
            /* Widen the separator to the target's character width;
               the temporary copy is freed after the helper call. */
            thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
            if (!thousands_sep_data)
                return -1;
        }
        else {
            /* NOTE(review): this replaces `data` with a temporary widened
               copy of the whole of `unicode` (the `index` offset applied
               above is dropped), the switch below still dispatches on the
               original narrower `kind`, and the copy is freed afterwards.
               It looks like output written on this path never reaches
               `unicode` -- confirm that callers always supply a target
               wide enough for the separator so this branch is not hit. */
            data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
            if (!data)
                return -1;
        }
    }

    switch (kind) {
    case PyUnicode_1BYTE_KIND:
        if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
            len = asciilib_InsertThousandsGrouping(
                (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
                min_width, grouping,
                (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
        else
            len = ucs1lib_InsertThousandsGrouping(
                (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
                min_width, grouping,
                (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
        break;
    case PyUnicode_2BYTE_KIND:
        len = ucs2lib_InsertThousandsGrouping(
            (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
            min_width, grouping,
            (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
        break;
    case PyUnicode_4BYTE_KIND:
        len = ucs4lib_InsertThousandsGrouping(
            (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
            min_width, grouping,
            (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
        break;
    default:
        /* NOTE(review): returns without releasing a temporary buffer
           allocated above; only matters if this branch is reachable. */
        assert(0);
        return -1;
    }
    /* Release whichever temporary buffer was created before the switch. */
    if (unicode != NULL && thousands_sep_kind != kind) {
        if (thousands_sep_kind < kind)
            PyMem_Free(thousands_sep_data);
        else
            PyMem_Free(data);
    }
    if (unicode == NULL) {
        /* No target string: report the maximum character the caller has
           to allow for.  The separator is only taken into account when
           the result length differs from the plain digit count. */
        *maxchar = 127;
        if (len != n_digits) {
            *maxchar = Py_MAX(*maxchar,
                              PyUnicode_MAX_CHAR_VALUE(thousands_sep));
        }
    }
    return len;
}
9368
9369
Alexander Belopolsky40018472011-02-26 01:02:56 +00009370Py_ssize_t
9371PyUnicode_Count(PyObject *str,
9372 PyObject *substr,
9373 Py_ssize_t start,
9374 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009375{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009376 Py_ssize_t result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009377 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009378 void *buf1 = NULL, *buf2 = NULL;
9379 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009380
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009381 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009382 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009383
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009384 kind1 = PyUnicode_KIND(str);
9385 kind2 = PyUnicode_KIND(substr);
9386 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009387 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009388
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009389 len1 = PyUnicode_GET_LENGTH(str);
9390 len2 = PyUnicode_GET_LENGTH(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009391 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009392 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009393 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009394
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009395 buf1 = PyUnicode_DATA(str);
9396 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009397 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009398 buf2 = _PyUnicode_AsKind(substr, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009399 if (!buf2)
9400 goto onError;
9401 }
9402
9403 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009404 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009405 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
Victor Stinnerc3cec782011-10-05 21:24:08 +02009406 result = asciilib_count(
9407 ((Py_UCS1*)buf1) + start, end - start,
9408 buf2, len2, PY_SSIZE_T_MAX
9409 );
9410 else
9411 result = ucs1lib_count(
9412 ((Py_UCS1*)buf1) + start, end - start,
9413 buf2, len2, PY_SSIZE_T_MAX
9414 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009415 break;
9416 case PyUnicode_2BYTE_KIND:
9417 result = ucs2lib_count(
9418 ((Py_UCS2*)buf1) + start, end - start,
9419 buf2, len2, PY_SSIZE_T_MAX
9420 );
9421 break;
9422 case PyUnicode_4BYTE_KIND:
9423 result = ucs4lib_count(
9424 ((Py_UCS4*)buf1) + start, end - start,
9425 buf2, len2, PY_SSIZE_T_MAX
9426 );
9427 break;
9428 default:
9429 assert(0); result = 0;
9430 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009431
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009432 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009433 PyMem_Free(buf2);
9434
Guido van Rossumd57fd912000-03-10 22:53:23 +00009435 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009436 onError:
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009437 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009438 PyMem_Free(buf2);
9439 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009440}
9441
Alexander Belopolsky40018472011-02-26 01:02:56 +00009442Py_ssize_t
9443PyUnicode_Find(PyObject *str,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009444 PyObject *substr,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009445 Py_ssize_t start,
9446 Py_ssize_t end,
9447 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009448{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009449 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009450 return -2;
Tim Petersced69f82003-09-16 20:30:58 +00009451
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009452 return any_find_slice(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009453}
9454
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009455Py_ssize_t
9456PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9457 Py_ssize_t start, Py_ssize_t end,
9458 int direction)
9459{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009460 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009461 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009462 if (PyUnicode_READY(str) == -1)
9463 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009464 if (start < 0 || end < 0) {
9465 PyErr_SetString(PyExc_IndexError, "string index out of range");
9466 return -2;
9467 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009468 if (end > PyUnicode_GET_LENGTH(str))
9469 end = PyUnicode_GET_LENGTH(str);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009470 if (start >= end)
9471 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009472 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009473 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9474 kind, end-start, ch, direction);
9475 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009476 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009477 else
9478 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009479}
9480
Alexander Belopolsky40018472011-02-26 01:02:56 +00009481static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009482tailmatch(PyObject *self,
9483 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009484 Py_ssize_t start,
9485 Py_ssize_t end,
9486 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009487{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009488 int kind_self;
9489 int kind_sub;
9490 void *data_self;
9491 void *data_sub;
9492 Py_ssize_t offset;
9493 Py_ssize_t i;
9494 Py_ssize_t end_sub;
9495
9496 if (PyUnicode_READY(self) == -1 ||
9497 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009498 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009499
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009500 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9501 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009502 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009503 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009504
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009505 if (PyUnicode_GET_LENGTH(substring) == 0)
9506 return 1;
9507
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009508 kind_self = PyUnicode_KIND(self);
9509 data_self = PyUnicode_DATA(self);
9510 kind_sub = PyUnicode_KIND(substring);
9511 data_sub = PyUnicode_DATA(substring);
9512 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9513
9514 if (direction > 0)
9515 offset = end;
9516 else
9517 offset = start;
9518
9519 if (PyUnicode_READ(kind_self, data_self, offset) ==
9520 PyUnicode_READ(kind_sub, data_sub, 0) &&
9521 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9522 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9523 /* If both are of the same kind, memcmp is sufficient */
9524 if (kind_self == kind_sub) {
9525 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009526 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009527 data_sub,
9528 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009529 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009530 }
Martin Pantere26da7c2016-06-02 10:07:09 +00009531 /* otherwise we have to compare each character by first accessing it */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009532 else {
9533 /* We do not need to compare 0 and len(substring)-1 because
9534 the if statement above ensured already that they are equal
9535 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009536 for (i = 1; i < end_sub; ++i) {
9537 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9538 PyUnicode_READ(kind_sub, data_sub, i))
9539 return 0;
9540 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009541 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009542 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009543 }
9544
9545 return 0;
9546}
9547
Alexander Belopolsky40018472011-02-26 01:02:56 +00009548Py_ssize_t
9549PyUnicode_Tailmatch(PyObject *str,
9550 PyObject *substr,
9551 Py_ssize_t start,
9552 Py_ssize_t end,
9553 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009554{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009555 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009556 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009557
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009558 return tailmatch(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009559}
9560
Guido van Rossumd57fd912000-03-10 22:53:23 +00009561/* Apply fixfct filter to the Unicode object self and return a
9562 reference to the modified object */
9563
Alexander Belopolsky40018472011-02-26 01:02:56 +00009564static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009565fixup(PyObject *self,
9566 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009567{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009568 PyObject *u;
9569 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009570 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009571
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009572 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009573 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009574 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009575 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009576
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009577 /* fix functions return the new maximum character in a string,
9578 if the kind of the resulting unicode object does not change,
9579 everything is fine. Otherwise we need to change the string kind
9580 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009581 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009582
9583 if (maxchar_new == 0) {
9584 /* no changes */;
9585 if (PyUnicode_CheckExact(self)) {
9586 Py_DECREF(u);
9587 Py_INCREF(self);
9588 return self;
9589 }
9590 else
9591 return u;
9592 }
9593
Victor Stinnere6abb482012-05-02 01:15:40 +02009594 maxchar_new = align_maxchar(maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009595
Victor Stinnereaab6042011-12-11 22:22:39 +01009596 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009597 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009598
9599 /* In case the maximum character changed, we need to
9600 convert the string to the new category. */
9601 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9602 if (v == NULL) {
9603 Py_DECREF(u);
9604 return NULL;
9605 }
9606 if (maxchar_new > maxchar_old) {
9607 /* If the maxchar increased so that the kind changed, not all
9608 characters are representable anymore and we need to fix the
9609 string again. This only happens in very few cases. */
Victor Stinnerd3f08822012-05-29 12:57:52 +02009610 _PyUnicode_FastCopyCharacters(v, 0,
9611 self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinnereaab6042011-12-11 22:22:39 +01009612 maxchar_old = fixfct(v);
9613 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009614 }
9615 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009616 _PyUnicode_FastCopyCharacters(v, 0,
9617 u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009618 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009619 Py_DECREF(u);
9620 assert(_PyUnicode_CheckConsistency(v, 1));
9621 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009622}
9623
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009624static PyObject *
9625ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009626{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009627 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9628 char *resdata, *data = PyUnicode_DATA(self);
9629 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009630
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009631 res = PyUnicode_New(len, 127);
9632 if (res == NULL)
9633 return NULL;
9634 resdata = PyUnicode_DATA(res);
9635 if (lower)
9636 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009637 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009638 _Py_bytes_upper(resdata, data, len);
9639 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009640}
9641
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009642static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009643handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009644{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009645 Py_ssize_t j;
9646 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01009647 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009648 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009649
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009650 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9651
9652 where ! is a negation and \p{xxx} is a character with property xxx.
9653 */
9654 for (j = i - 1; j >= 0; j--) {
9655 c = PyUnicode_READ(kind, data, j);
9656 if (!_PyUnicode_IsCaseIgnorable(c))
9657 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009658 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009659 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9660 if (final_sigma) {
9661 for (j = i + 1; j < length; j++) {
9662 c = PyUnicode_READ(kind, data, j);
9663 if (!_PyUnicode_IsCaseIgnorable(c))
9664 break;
9665 }
9666 final_sigma = j == length || !_PyUnicode_IsCased(c);
9667 }
9668 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009669}
9670
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009671static int
9672lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9673 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009674{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009675 /* Obscure special case. */
9676 if (c == 0x3A3) {
9677 mapped[0] = handle_capital_sigma(kind, data, length, i);
9678 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009679 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009680 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009681}
9682
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009683static Py_ssize_t
9684do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009685{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009686 Py_ssize_t i, k = 0;
9687 int n_res, j;
9688 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009689
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009690 c = PyUnicode_READ(kind, data, 0);
9691 n_res = _PyUnicode_ToUpperFull(c, mapped);
9692 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009693 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009694 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009695 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009696 for (i = 1; i < length; i++) {
9697 c = PyUnicode_READ(kind, data, i);
9698 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9699 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009700 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009701 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009702 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009703 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009704 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009705}
9706
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009707static Py_ssize_t
9708do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9709 Py_ssize_t i, k = 0;
9710
9711 for (i = 0; i < length; i++) {
9712 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9713 int n_res, j;
9714 if (Py_UNICODE_ISUPPER(c)) {
9715 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9716 }
9717 else if (Py_UNICODE_ISLOWER(c)) {
9718 n_res = _PyUnicode_ToUpperFull(c, mapped);
9719 }
9720 else {
9721 n_res = 1;
9722 mapped[0] = c;
9723 }
9724 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009725 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009726 res[k++] = mapped[j];
9727 }
9728 }
9729 return k;
9730}
9731
9732static Py_ssize_t
9733do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9734 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009735{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009736 Py_ssize_t i, k = 0;
9737
9738 for (i = 0; i < length; i++) {
9739 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9740 int n_res, j;
9741 if (lower)
9742 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9743 else
9744 n_res = _PyUnicode_ToUpperFull(c, mapped);
9745 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009746 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009747 res[k++] = mapped[j];
9748 }
9749 }
9750 return k;
9751}
9752
9753static Py_ssize_t
9754do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9755{
9756 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9757}
9758
9759static Py_ssize_t
9760do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9761{
9762 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9763}
9764
Benjamin Petersone51757f2012-01-12 21:10:29 -05009765static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009766do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9767{
9768 Py_ssize_t i, k = 0;
9769
9770 for (i = 0; i < length; i++) {
9771 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9772 Py_UCS4 mapped[3];
9773 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9774 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009775 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009776 res[k++] = mapped[j];
9777 }
9778 }
9779 return k;
9780}
9781
9782static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009783do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9784{
9785 Py_ssize_t i, k = 0;
9786 int previous_is_cased;
9787
9788 previous_is_cased = 0;
9789 for (i = 0; i < length; i++) {
9790 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9791 Py_UCS4 mapped[3];
9792 int n_res, j;
9793
9794 if (previous_is_cased)
9795 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9796 else
9797 n_res = _PyUnicode_ToTitleFull(c, mapped);
9798
9799 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009800 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009801 res[k++] = mapped[j];
9802 }
9803
9804 previous_is_cased = _PyUnicode_IsCased(c);
9805 }
9806 return k;
9807}
9808
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009809static PyObject *
9810case_operation(PyObject *self,
9811 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9812{
9813 PyObject *res = NULL;
9814 Py_ssize_t length, newlength = 0;
9815 int kind, outkind;
9816 void *data, *outdata;
9817 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9818
Benjamin Petersoneea48462012-01-16 14:28:50 -05009819 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009820
9821 kind = PyUnicode_KIND(self);
9822 data = PyUnicode_DATA(self);
9823 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +02009824 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -04009825 PyErr_SetString(PyExc_OverflowError, "string is too long");
9826 return NULL;
9827 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -04009828 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009829 if (tmp == NULL)
9830 return PyErr_NoMemory();
9831 newlength = perform(kind, data, length, tmp, &maxchar);
9832 res = PyUnicode_New(newlength, maxchar);
9833 if (res == NULL)
9834 goto leave;
9835 tmpend = tmp + newlength;
9836 outdata = PyUnicode_DATA(res);
9837 outkind = PyUnicode_KIND(res);
9838 switch (outkind) {
9839 case PyUnicode_1BYTE_KIND:
9840 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9841 break;
9842 case PyUnicode_2BYTE_KIND:
9843 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9844 break;
9845 case PyUnicode_4BYTE_KIND:
9846 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9847 break;
9848 default:
9849 assert(0);
9850 break;
9851 }
9852 leave:
9853 PyMem_FREE(tmp);
9854 return res;
9855}
9856
Tim Peters8ce9f162004-08-27 01:49:32 +00009857PyObject *
9858PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009859{
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009860 PyObject *res;
9861 PyObject *fseq;
9862 Py_ssize_t seqlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009863 PyObject **items;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009864
Benjamin Peterson9743b2c2014-02-15 13:02:52 -05009865 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +00009866 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009867 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009868 }
9869
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009870 /* NOTE: the following code can't call back into Python code,
9871 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009872 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009873
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009874 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00009875 seqlen = PySequence_Fast_GET_SIZE(fseq);
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009876 res = _PyUnicode_JoinArray(separator, items, seqlen);
9877 Py_DECREF(fseq);
9878 return res;
9879}
9880
9881PyObject *
9882_PyUnicode_JoinArray(PyObject *separator, PyObject **items, Py_ssize_t seqlen)
9883{
9884 PyObject *res = NULL; /* the result */
9885 PyObject *sep = NULL;
9886 Py_ssize_t seplen;
9887 PyObject *item;
9888 Py_ssize_t sz, i, res_offset;
9889 Py_UCS4 maxchar;
9890 Py_UCS4 item_maxchar;
9891 int use_memcpy;
9892 unsigned char *res_data = NULL, *sep_data = NULL;
9893 PyObject *last_obj;
9894 unsigned int kind = 0;
9895
Tim Peters05eba1f2004-08-27 21:32:02 +00009896 /* If empty sequence, return u"". */
9897 if (seqlen == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02009898 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +00009899 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009900
Tim Peters05eba1f2004-08-27 21:32:02 +00009901 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009902 last_obj = NULL;
Victor Stinneracf47b82011-10-06 12:32:37 +02009903 if (seqlen == 1) {
9904 if (PyUnicode_CheckExact(items[0])) {
9905 res = items[0];
9906 Py_INCREF(res);
Victor Stinneracf47b82011-10-06 12:32:37 +02009907 return res;
9908 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009909 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009910 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009911 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009912 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009913 /* Set up sep and seplen */
9914 if (separator == NULL) {
9915 /* fall back to a blank space separator */
9916 sep = PyUnicode_FromOrdinal(' ');
9917 if (!sep)
9918 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009919 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009920 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009921 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009922 else {
9923 if (!PyUnicode_Check(separator)) {
9924 PyErr_Format(PyExc_TypeError,
9925 "separator: expected str instance,"
9926 " %.80s found",
9927 Py_TYPE(separator)->tp_name);
9928 goto onError;
9929 }
9930 if (PyUnicode_READY(separator))
9931 goto onError;
9932 sep = separator;
9933 seplen = PyUnicode_GET_LENGTH(separator);
9934 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9935 /* inc refcount to keep this code path symmetric with the
9936 above case of a blank separator */
9937 Py_INCREF(sep);
9938 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009939 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009940 }
9941
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009942 /* There are at least two things to join, or else we have a subclass
9943 * of str in the sequence.
9944 * Do a pre-pass to figure out the total amount of space we'll
9945 * need (sz), and see whether all argument are strings.
9946 */
9947 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009948#ifdef Py_DEBUG
9949 use_memcpy = 0;
9950#else
9951 use_memcpy = 1;
9952#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009953 for (i = 0; i < seqlen; i++) {
9954 const Py_ssize_t old_sz = sz;
9955 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009956 if (!PyUnicode_Check(item)) {
9957 PyErr_Format(PyExc_TypeError,
Victor Stinnera33bce02014-07-04 22:47:46 +02009958 "sequence item %zd: expected str instance,"
Benjamin Peterson29060642009-01-31 22:14:21 +00009959 " %.80s found",
9960 i, Py_TYPE(item)->tp_name);
9961 goto onError;
9962 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009963 if (PyUnicode_READY(item) == -1)
9964 goto onError;
9965 sz += PyUnicode_GET_LENGTH(item);
9966 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -07009967 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009968 if (i != 0)
9969 sz += seplen;
9970 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9971 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009972 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009973 goto onError;
9974 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009975 if (use_memcpy && last_obj != NULL) {
9976 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9977 use_memcpy = 0;
9978 }
9979 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009980 }
Tim Petersced69f82003-09-16 20:30:58 +00009981
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009982 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009983 if (res == NULL)
9984 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009985
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009986 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009987#ifdef Py_DEBUG
9988 use_memcpy = 0;
9989#else
9990 if (use_memcpy) {
9991 res_data = PyUnicode_1BYTE_DATA(res);
9992 kind = PyUnicode_KIND(res);
9993 if (seplen != 0)
9994 sep_data = PyUnicode_1BYTE_DATA(sep);
9995 }
9996#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +02009997 if (use_memcpy) {
9998 for (i = 0; i < seqlen; ++i) {
9999 Py_ssize_t itemlen;
10000 item = items[i];
10001
10002 /* Copy item, and maybe the separator. */
10003 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +020010004 Py_MEMCPY(res_data,
10005 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010006 kind * seplen);
10007 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010008 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010009
10010 itemlen = PyUnicode_GET_LENGTH(item);
10011 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +020010012 Py_MEMCPY(res_data,
10013 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010014 kind * itemlen);
10015 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010016 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010017 }
10018 assert(res_data == PyUnicode_1BYTE_DATA(res)
10019 + kind * PyUnicode_GET_LENGTH(res));
10020 }
10021 else {
10022 for (i = 0, res_offset = 0; i < seqlen; ++i) {
10023 Py_ssize_t itemlen;
10024 item = items[i];
10025
10026 /* Copy item, and maybe the separator. */
10027 if (i && seplen != 0) {
10028 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10029 res_offset += seplen;
10030 }
10031
10032 itemlen = PyUnicode_GET_LENGTH(item);
10033 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020010034 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +020010035 res_offset += itemlen;
10036 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010037 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010038 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +020010039 }
Tim Peters8ce9f162004-08-27 01:49:32 +000010040
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010041 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010042 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010043 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010044
Benjamin Peterson29060642009-01-31 22:14:21 +000010045 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010046 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010047 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010048 return NULL;
10049}
10050
/* FILL(kind, data, value, start, length)

   Store `value` into `length` consecutive characters of the character
   buffer `data`, beginning at character index `start`.  `kind` selects the
   element width (1, 2 or 4 byte kind); PyUnicode_WCHAR_KIND is not
   accepted.  For the 1-byte kind `value` is truncated to unsigned char.

   Every macro argument is parenthesized so that compound expressions
   (e.g. `cond ? a : b`) are cast and compared as a whole.  `value`,
   `start` and `length` may still be evaluated more than once. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
10074
Victor Stinnerd3f08822012-05-29 12:57:52 +020010075void
10076_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10077 Py_UCS4 fill_char)
10078{
10079 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
10080 const void *data = PyUnicode_DATA(unicode);
10081 assert(PyUnicode_IS_READY(unicode));
10082 assert(unicode_modifiable(unicode));
10083 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10084 assert(start >= 0);
10085 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
10086 FILL(kind, data, fill_char, start, length);
10087}
10088
Victor Stinner3fe55312012-01-04 00:33:50 +010010089Py_ssize_t
10090PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10091 Py_UCS4 fill_char)
10092{
10093 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +010010094
10095 if (!PyUnicode_Check(unicode)) {
10096 PyErr_BadInternalCall();
10097 return -1;
10098 }
10099 if (PyUnicode_READY(unicode) == -1)
10100 return -1;
10101 if (unicode_check_modifiable(unicode))
10102 return -1;
10103
Victor Stinnerd3f08822012-05-29 12:57:52 +020010104 if (start < 0) {
10105 PyErr_SetString(PyExc_IndexError, "string index out of range");
10106 return -1;
10107 }
Victor Stinner3fe55312012-01-04 00:33:50 +010010108 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10109 PyErr_SetString(PyExc_ValueError,
10110 "fill character is bigger than "
10111 "the string maximum character");
10112 return -1;
10113 }
10114
10115 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10116 length = Py_MIN(maxlen, length);
10117 if (length <= 0)
10118 return 0;
10119
Victor Stinnerd3f08822012-05-29 12:57:52 +020010120 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +010010121 return length;
10122}
10123
Victor Stinner9310abb2011-10-05 00:59:23 +020010124static PyObject *
10125pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010126 Py_ssize_t left,
10127 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010128 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010129{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010130 PyObject *u;
10131 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010132 int kind;
10133 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010134
10135 if (left < 0)
10136 left = 0;
10137 if (right < 0)
10138 right = 0;
10139
Victor Stinnerc4b49542011-12-11 22:44:26 +010010140 if (left == 0 && right == 0)
10141 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010142
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010143 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10144 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010145 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10146 return NULL;
10147 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010148 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010149 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010150 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010151 if (!u)
10152 return NULL;
10153
10154 kind = PyUnicode_KIND(u);
10155 data = PyUnicode_DATA(u);
10156 if (left)
10157 FILL(kind, data, fill, 0, left);
10158 if (right)
10159 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010160 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010161 assert(_PyUnicode_CheckConsistency(u, 1));
10162 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010163}
10164
Alexander Belopolsky40018472011-02-26 01:02:56 +000010165PyObject *
10166PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010167{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010168 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010169
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010170 if (ensure_unicode(string) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010171 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010172
Benjamin Petersonead6b532011-12-20 17:23:42 -060010173 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010174 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010175 if (PyUnicode_IS_ASCII(string))
10176 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010177 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010178 PyUnicode_GET_LENGTH(string), keepends);
10179 else
10180 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010181 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010182 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010183 break;
10184 case PyUnicode_2BYTE_KIND:
10185 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010186 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010187 PyUnicode_GET_LENGTH(string), keepends);
10188 break;
10189 case PyUnicode_4BYTE_KIND:
10190 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010191 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010192 PyUnicode_GET_LENGTH(string), keepends);
10193 break;
10194 default:
10195 assert(0);
10196 list = 0;
10197 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010198 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010199}
10200
Alexander Belopolsky40018472011-02-26 01:02:56 +000010201static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010202split(PyObject *self,
10203 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010204 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010205{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010206 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010207 void *buf1, *buf2;
10208 Py_ssize_t len1, len2;
10209 PyObject* out;
10210
Guido van Rossumd57fd912000-03-10 22:53:23 +000010211 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010212 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010213
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010214 if (PyUnicode_READY(self) == -1)
10215 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010216
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010217 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010218 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010219 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010220 if (PyUnicode_IS_ASCII(self))
10221 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010222 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010223 PyUnicode_GET_LENGTH(self), maxcount
10224 );
10225 else
10226 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010227 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010228 PyUnicode_GET_LENGTH(self), maxcount
10229 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010230 case PyUnicode_2BYTE_KIND:
10231 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010232 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010233 PyUnicode_GET_LENGTH(self), maxcount
10234 );
10235 case PyUnicode_4BYTE_KIND:
10236 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010237 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010238 PyUnicode_GET_LENGTH(self), maxcount
10239 );
10240 default:
10241 assert(0);
10242 return NULL;
10243 }
10244
10245 if (PyUnicode_READY(substring) == -1)
10246 return NULL;
10247
10248 kind1 = PyUnicode_KIND(self);
10249 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010250 len1 = PyUnicode_GET_LENGTH(self);
10251 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010252 if (kind1 < kind2 || len1 < len2) {
10253 out = PyList_New(1);
10254 if (out == NULL)
10255 return NULL;
10256 Py_INCREF(self);
10257 PyList_SET_ITEM(out, 0, self);
10258 return out;
10259 }
10260 buf1 = PyUnicode_DATA(self);
10261 buf2 = PyUnicode_DATA(substring);
10262 if (kind2 != kind1) {
10263 buf2 = _PyUnicode_AsKind(substring, kind1);
10264 if (!buf2)
10265 return NULL;
10266 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010267
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010268 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010269 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010270 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10271 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010272 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010273 else
10274 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010275 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010276 break;
10277 case PyUnicode_2BYTE_KIND:
10278 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010279 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010280 break;
10281 case PyUnicode_4BYTE_KIND:
10282 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010283 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010284 break;
10285 default:
10286 out = NULL;
10287 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010288 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010289 PyMem_Free(buf2);
10290 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010291}
10292
Alexander Belopolsky40018472011-02-26 01:02:56 +000010293static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010294rsplit(PyObject *self,
10295 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010296 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010297{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010298 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010299 void *buf1, *buf2;
10300 Py_ssize_t len1, len2;
10301 PyObject* out;
10302
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010303 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010304 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010305
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010306 if (PyUnicode_READY(self) == -1)
10307 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010308
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010309 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010310 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010311 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010312 if (PyUnicode_IS_ASCII(self))
10313 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010314 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010315 PyUnicode_GET_LENGTH(self), maxcount
10316 );
10317 else
10318 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010319 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010320 PyUnicode_GET_LENGTH(self), maxcount
10321 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010322 case PyUnicode_2BYTE_KIND:
10323 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010324 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010325 PyUnicode_GET_LENGTH(self), maxcount
10326 );
10327 case PyUnicode_4BYTE_KIND:
10328 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010329 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010330 PyUnicode_GET_LENGTH(self), maxcount
10331 );
10332 default:
10333 assert(0);
10334 return NULL;
10335 }
10336
10337 if (PyUnicode_READY(substring) == -1)
10338 return NULL;
10339
10340 kind1 = PyUnicode_KIND(self);
10341 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010342 len1 = PyUnicode_GET_LENGTH(self);
10343 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010344 if (kind1 < kind2 || len1 < len2) {
10345 out = PyList_New(1);
10346 if (out == NULL)
10347 return NULL;
10348 Py_INCREF(self);
10349 PyList_SET_ITEM(out, 0, self);
10350 return out;
10351 }
10352 buf1 = PyUnicode_DATA(self);
10353 buf2 = PyUnicode_DATA(substring);
10354 if (kind2 != kind1) {
10355 buf2 = _PyUnicode_AsKind(substring, kind1);
10356 if (!buf2)
10357 return NULL;
10358 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010359
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010360 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010361 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010362 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10363 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010364 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010365 else
10366 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010367 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010368 break;
10369 case PyUnicode_2BYTE_KIND:
10370 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010371 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010372 break;
10373 case PyUnicode_4BYTE_KIND:
10374 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010375 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010376 break;
10377 default:
10378 out = NULL;
10379 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010380 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010381 PyMem_Free(buf2);
10382 return out;
10383}
10384
10385static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010386anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10387 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010388{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010389 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010390 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010391 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10392 return asciilib_find(buf1, len1, buf2, len2, offset);
10393 else
10394 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010395 case PyUnicode_2BYTE_KIND:
10396 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10397 case PyUnicode_4BYTE_KIND:
10398 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10399 }
10400 assert(0);
10401 return -1;
10402}
10403
10404static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010405anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10406 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010407{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010408 switch (kind) {
10409 case PyUnicode_1BYTE_KIND:
10410 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10411 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10412 else
10413 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10414 case PyUnicode_2BYTE_KIND:
10415 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10416 case PyUnicode_4BYTE_KIND:
10417 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10418 }
10419 assert(0);
10420 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010421}
10422
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010423static void
10424replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10425 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10426{
10427 int kind = PyUnicode_KIND(u);
10428 void *data = PyUnicode_DATA(u);
10429 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10430 if (kind == PyUnicode_1BYTE_KIND) {
10431 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10432 (Py_UCS1 *)data + len,
10433 u1, u2, maxcount);
10434 }
10435 else if (kind == PyUnicode_2BYTE_KIND) {
10436 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10437 (Py_UCS2 *)data + len,
10438 u1, u2, maxcount);
10439 }
10440 else {
10441 assert(kind == PyUnicode_4BYTE_KIND);
10442 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10443 (Py_UCS4 *)data + len,
10444 u1, u2, maxcount);
10445 }
10446}
10447
Alexander Belopolsky40018472011-02-26 01:02:56 +000010448static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010449replace(PyObject *self, PyObject *str1,
10450 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010451{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010452 PyObject *u;
10453 char *sbuf = PyUnicode_DATA(self);
10454 char *buf1 = PyUnicode_DATA(str1);
10455 char *buf2 = PyUnicode_DATA(str2);
10456 int srelease = 0, release1 = 0, release2 = 0;
10457 int skind = PyUnicode_KIND(self);
10458 int kind1 = PyUnicode_KIND(str1);
10459 int kind2 = PyUnicode_KIND(str2);
10460 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10461 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10462 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010463 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010464 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010465
10466 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010467 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010468 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010469 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010470
Victor Stinner59de0ee2011-10-07 10:01:28 +020010471 if (str1 == str2)
10472 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010473
Victor Stinner49a0a212011-10-12 23:46:10 +020010474 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010475 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10476 if (maxchar < maxchar_str1)
10477 /* substring too wide to be present */
10478 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010479 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10480 /* Replacing str1 with str2 may cause a maxchar reduction in the
10481 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010482 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010483 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010484
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010485 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010486 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010487 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010488 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010489 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010490 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010491 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010492 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010493
Victor Stinner69ed0f42013-04-09 21:48:24 +020010494 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010495 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010496 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010497 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010498 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010499 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010500 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010501 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010502
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010503 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10504 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010505 }
10506 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010507 int rkind = skind;
10508 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010509 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010510
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010511 if (kind1 < rkind) {
10512 /* widen substring */
10513 buf1 = _PyUnicode_AsKind(str1, rkind);
10514 if (!buf1) goto error;
10515 release1 = 1;
10516 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010517 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010518 if (i < 0)
10519 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010520 if (rkind > kind2) {
10521 /* widen replacement */
10522 buf2 = _PyUnicode_AsKind(str2, rkind);
10523 if (!buf2) goto error;
10524 release2 = 1;
10525 }
10526 else if (rkind < kind2) {
10527 /* widen self and buf1 */
10528 rkind = kind2;
10529 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010530 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010531 sbuf = _PyUnicode_AsKind(self, rkind);
10532 if (!sbuf) goto error;
10533 srelease = 1;
10534 buf1 = _PyUnicode_AsKind(str1, rkind);
10535 if (!buf1) goto error;
10536 release1 = 1;
10537 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010538 u = PyUnicode_New(slen, maxchar);
10539 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010540 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010541 assert(PyUnicode_KIND(u) == rkind);
10542 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010543
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010544 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010545 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010546 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010547 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010548 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010549 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010550
10551 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010552 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010553 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010554 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010555 if (i == -1)
10556 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010557 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010558 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010559 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010560 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010561 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010562 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010563 }
10564 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010565 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010566 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010567 int rkind = skind;
10568 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010569
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010570 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010571 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010572 buf1 = _PyUnicode_AsKind(str1, rkind);
10573 if (!buf1) goto error;
10574 release1 = 1;
10575 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010576 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010577 if (n == 0)
10578 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010579 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010580 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010581 buf2 = _PyUnicode_AsKind(str2, rkind);
10582 if (!buf2) goto error;
10583 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010584 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010585 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010586 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010587 rkind = kind2;
10588 sbuf = _PyUnicode_AsKind(self, rkind);
10589 if (!sbuf) goto error;
10590 srelease = 1;
10591 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010592 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010593 buf1 = _PyUnicode_AsKind(str1, rkind);
10594 if (!buf1) goto error;
10595 release1 = 1;
10596 }
10597 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10598 PyUnicode_GET_LENGTH(str1))); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010599 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010600 PyErr_SetString(PyExc_OverflowError,
10601 "replace string is too long");
10602 goto error;
10603 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010604 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010605 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010606 _Py_INCREF_UNICODE_EMPTY();
10607 if (!unicode_empty)
10608 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010609 u = unicode_empty;
10610 goto done;
10611 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010612 if (new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010613 PyErr_SetString(PyExc_OverflowError,
10614 "replace string is too long");
10615 goto error;
10616 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010617 u = PyUnicode_New(new_size, maxchar);
10618 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010619 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010620 assert(PyUnicode_KIND(u) == rkind);
10621 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010622 ires = i = 0;
10623 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010624 while (n-- > 0) {
10625 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010626 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010627 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010628 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010629 if (j == -1)
10630 break;
10631 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010632 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010633 memcpy(res + rkind * ires,
10634 sbuf + rkind * i,
10635 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010636 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010637 }
10638 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010639 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010640 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010641 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010642 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010643 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010644 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010645 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010646 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010647 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010648 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010649 memcpy(res + rkind * ires,
10650 sbuf + rkind * i,
10651 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010652 }
10653 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010654 /* interleave */
10655 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010656 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010657 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010658 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010659 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010660 if (--n <= 0)
10661 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010662 memcpy(res + rkind * ires,
10663 sbuf + rkind * i,
10664 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010665 ires++;
10666 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010667 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010668 memcpy(res + rkind * ires,
10669 sbuf + rkind * i,
10670 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010671 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010672 }
10673
10674 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010675 unicode_adjust_maxchar(&u);
10676 if (u == NULL)
10677 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010678 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010679
10680 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010681 if (srelease)
10682 PyMem_FREE(sbuf);
10683 if (release1)
10684 PyMem_FREE(buf1);
10685 if (release2)
10686 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010687 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010688 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010689
Benjamin Peterson29060642009-01-31 22:14:21 +000010690 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010691 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010692 if (srelease)
10693 PyMem_FREE(sbuf);
10694 if (release1)
10695 PyMem_FREE(buf1);
10696 if (release2)
10697 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010698 return unicode_result_unchanged(self);
10699
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010700 error:
10701 if (srelease && sbuf)
10702 PyMem_FREE(sbuf);
10703 if (release1 && buf1)
10704 PyMem_FREE(buf1);
10705 if (release2 && buf2)
10706 PyMem_FREE(buf2);
10707 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010708}
10709
10710/* --- Unicode Object Methods --------------------------------------------- */
10711
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010712PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010713 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010714\n\
10715Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010716characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010717
10718static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010719unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010720{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010721 if (PyUnicode_READY(self) == -1)
10722 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010723 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010724}
10725
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010726PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010727 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010728\n\
10729Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010730have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010731
10732static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010733unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010734{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010735 if (PyUnicode_READY(self) == -1)
10736 return NULL;
10737 if (PyUnicode_GET_LENGTH(self) == 0)
10738 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010739 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010740}
10741
Benjamin Petersond5890c82012-01-14 13:23:30 -050010742PyDoc_STRVAR(casefold__doc__,
10743 "S.casefold() -> str\n\
10744\n\
10745Return a version of S suitable for caseless comparisons.");
10746
10747static PyObject *
10748unicode_casefold(PyObject *self)
10749{
10750 if (PyUnicode_READY(self) == -1)
10751 return NULL;
10752 if (PyUnicode_IS_ASCII(self))
10753 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010754 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010755}
10756
10757
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010758/* Argument converter. Accepts a single Unicode character. */
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010759
10760static int
10761convert_uc(PyObject *obj, void *addr)
10762{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010763 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010764
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010765 if (!PyUnicode_Check(obj)) {
10766 PyErr_Format(PyExc_TypeError,
10767 "The fill character must be a unicode character, "
10768 "not %.100s", Py_TYPE(obj)->tp_name);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010769 return 0;
10770 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010771 if (PyUnicode_READY(obj) < 0)
10772 return 0;
10773 if (PyUnicode_GET_LENGTH(obj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010774 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010775 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010776 return 0;
10777 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010778 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010779 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010780}
10781
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010782PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010783 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010784\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010785Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010786done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010787
10788static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010789unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010790{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010791 Py_ssize_t marg, left;
10792 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010793 Py_UCS4 fillchar = ' ';
10794
Victor Stinnere9a29352011-10-01 02:14:59 +020010795 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010796 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010797
Benjamin Petersonbac79492012-01-14 13:34:47 -050010798 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010799 return NULL;
10800
Victor Stinnerc4b49542011-12-11 22:44:26 +010010801 if (PyUnicode_GET_LENGTH(self) >= width)
10802 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010803
Victor Stinnerc4b49542011-12-11 22:44:26 +010010804 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010805 left = marg / 2 + (marg & width & 1);
10806
Victor Stinner9310abb2011-10-05 00:59:23 +020010807 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010808}
10809
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010810/* This function assumes that str1 and str2 are readied by the caller. */
10811
Marc-André Lemburge5034372000-08-08 08:04:29 +000010812static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010813unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010814{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010815#define COMPARE(TYPE1, TYPE2) \
10816 do { \
10817 TYPE1* p1 = (TYPE1 *)data1; \
10818 TYPE2* p2 = (TYPE2 *)data2; \
10819 TYPE1* end = p1 + len; \
10820 Py_UCS4 c1, c2; \
10821 for (; p1 != end; p1++, p2++) { \
10822 c1 = *p1; \
10823 c2 = *p2; \
10824 if (c1 != c2) \
10825 return (c1 < c2) ? -1 : 1; \
10826 } \
10827 } \
10828 while (0)
10829
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010830 int kind1, kind2;
10831 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010832 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010833
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010834 kind1 = PyUnicode_KIND(str1);
10835 kind2 = PyUnicode_KIND(str2);
10836 data1 = PyUnicode_DATA(str1);
10837 data2 = PyUnicode_DATA(str2);
10838 len1 = PyUnicode_GET_LENGTH(str1);
10839 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010840 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010841
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010842 switch(kind1) {
10843 case PyUnicode_1BYTE_KIND:
10844 {
10845 switch(kind2) {
10846 case PyUnicode_1BYTE_KIND:
10847 {
10848 int cmp = memcmp(data1, data2, len);
10849 /* normalize result of memcmp() into the range [-1; 1] */
10850 if (cmp < 0)
10851 return -1;
10852 if (cmp > 0)
10853 return 1;
10854 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020010855 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010856 case PyUnicode_2BYTE_KIND:
10857 COMPARE(Py_UCS1, Py_UCS2);
10858 break;
10859 case PyUnicode_4BYTE_KIND:
10860 COMPARE(Py_UCS1, Py_UCS4);
10861 break;
10862 default:
10863 assert(0);
10864 }
10865 break;
10866 }
10867 case PyUnicode_2BYTE_KIND:
10868 {
10869 switch(kind2) {
10870 case PyUnicode_1BYTE_KIND:
10871 COMPARE(Py_UCS2, Py_UCS1);
10872 break;
10873 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010874 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010875 COMPARE(Py_UCS2, Py_UCS2);
10876 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010877 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010878 case PyUnicode_4BYTE_KIND:
10879 COMPARE(Py_UCS2, Py_UCS4);
10880 break;
10881 default:
10882 assert(0);
10883 }
10884 break;
10885 }
10886 case PyUnicode_4BYTE_KIND:
10887 {
10888 switch(kind2) {
10889 case PyUnicode_1BYTE_KIND:
10890 COMPARE(Py_UCS4, Py_UCS1);
10891 break;
10892 case PyUnicode_2BYTE_KIND:
10893 COMPARE(Py_UCS4, Py_UCS2);
10894 break;
10895 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010896 {
10897#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10898 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10899 /* normalize result of wmemcmp() into the range [-1; 1] */
10900 if (cmp < 0)
10901 return -1;
10902 if (cmp > 0)
10903 return 1;
10904#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010905 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020010906#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010907 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010908 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010909 default:
10910 assert(0);
10911 }
10912 break;
10913 }
10914 default:
10915 assert(0);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010916 }
10917
Victor Stinner770e19e2012-10-04 22:59:45 +020010918 if (len1 == len2)
10919 return 0;
10920 if (len1 < len2)
10921 return -1;
10922 else
10923 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010924
10925#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000010926}
10927
Victor Stinnerc8bc5372013-11-04 11:08:10 +010010928Py_LOCAL(int)
Victor Stinnere5567ad2012-10-23 02:48:49 +020010929unicode_compare_eq(PyObject *str1, PyObject *str2)
10930{
10931 int kind;
10932 void *data1, *data2;
10933 Py_ssize_t len;
10934 int cmp;
10935
Victor Stinnere5567ad2012-10-23 02:48:49 +020010936 len = PyUnicode_GET_LENGTH(str1);
10937 if (PyUnicode_GET_LENGTH(str2) != len)
10938 return 0;
10939 kind = PyUnicode_KIND(str1);
10940 if (PyUnicode_KIND(str2) != kind)
10941 return 0;
10942 data1 = PyUnicode_DATA(str1);
10943 data2 = PyUnicode_DATA(str2);
10944
10945 cmp = memcmp(data1, data2, len * kind);
10946 return (cmp == 0);
10947}
10948
10949
Alexander Belopolsky40018472011-02-26 01:02:56 +000010950int
10951PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010952{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010953 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10954 if (PyUnicode_READY(left) == -1 ||
10955 PyUnicode_READY(right) == -1)
10956 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010010957
10958 /* a string is equal to itself */
10959 if (left == right)
10960 return 0;
10961
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010962 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010963 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010964 PyErr_Format(PyExc_TypeError,
10965 "Can't compare %.100s and %.100s",
10966 left->ob_type->tp_name,
10967 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010968 return -1;
10969}
10970
Martin v. Löwis5b222132007-06-10 09:51:05 +000010971int
Victor Stinnerad14ccd2013-11-07 00:46:04 +010010972_PyUnicode_CompareWithId(PyObject *left, _Py_Identifier *right)
10973{
10974 PyObject *right_str = _PyUnicode_FromId(right); /* borrowed */
10975 if (right_str == NULL)
10976 return -1;
10977 return PyUnicode_Compare(left, right_str);
10978}
10979
10980int
Martin v. Löwis5b222132007-06-10 09:51:05 +000010981PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10982{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010983 Py_ssize_t i;
10984 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010985 Py_UCS4 chr;
10986
Victor Stinner910337b2011-10-03 03:20:16 +020010987 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010988 if (PyUnicode_READY(uni) == -1)
10989 return -1;
10990 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010010991 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010010992 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010010993 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010010994 size_t len, len2 = strlen(str);
10995 int cmp;
10996
10997 len = Py_MIN(len1, len2);
10998 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010010999 if (cmp != 0) {
11000 if (cmp < 0)
11001 return -1;
11002 else
11003 return 1;
11004 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010011005 if (len1 > len2)
11006 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011007 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010011008 return -1; /* str is longer */
11009 return 0;
11010 }
11011 else {
11012 void *data = PyUnicode_DATA(uni);
11013 /* Compare Unicode string and source character set string */
11014 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020011015 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010011016 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11017 /* This check keeps Python strings that end in '\0' from comparing equal
11018 to C strings identical up to that point. */
11019 if (PyUnicode_GET_LENGTH(uni) != i || chr)
11020 return 1; /* uni is longer */
11021 if (str[i])
11022 return -1; /* str is longer */
11023 return 0;
11024 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000011025}
11026
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011027
/* Map a C truth value to Py_True / Py_False; the expansion does not touch
   any reference count. */
#define TEST_COND(cond) ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011030
Alexander Belopolsky40018472011-02-26 01:02:56 +000011031PyObject *
11032PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011033{
11034 int result;
Victor Stinnere5567ad2012-10-23 02:48:49 +020011035 PyObject *v;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011036
Victor Stinnere5567ad2012-10-23 02:48:49 +020011037 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11038 Py_RETURN_NOTIMPLEMENTED;
11039
11040 if (PyUnicode_READY(left) == -1 ||
11041 PyUnicode_READY(right) == -1)
11042 return NULL;
11043
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011044 if (left == right) {
11045 switch (op) {
11046 case Py_EQ:
11047 case Py_LE:
11048 case Py_GE:
11049 /* a string is equal to itself */
11050 v = Py_True;
11051 break;
11052 case Py_NE:
11053 case Py_LT:
11054 case Py_GT:
11055 v = Py_False;
11056 break;
11057 default:
11058 PyErr_BadArgument();
11059 return NULL;
11060 }
11061 }
11062 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020011063 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010011064 result ^= (op == Py_NE);
11065 v = TEST_COND(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020011066 }
11067 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020011068 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011069
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011070 /* Convert the return value to a Boolean */
11071 switch (op) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011072 case Py_LE:
11073 v = TEST_COND(result <= 0);
11074 break;
11075 case Py_GE:
11076 v = TEST_COND(result >= 0);
11077 break;
11078 case Py_LT:
11079 v = TEST_COND(result == -1);
11080 break;
11081 case Py_GT:
11082 v = TEST_COND(result == 1);
11083 break;
11084 default:
11085 PyErr_BadArgument();
11086 return NULL;
11087 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011088 }
Victor Stinnere5567ad2012-10-23 02:48:49 +020011089 Py_INCREF(v);
11090 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011091}
11092
Alexander Belopolsky40018472011-02-26 01:02:56 +000011093int
Raymond Hettingerac2ef652015-07-04 16:04:44 -070011094_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11095{
11096 return unicode_eq(aa, bb);
11097}
11098
11099int
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011100PyUnicode_Contains(PyObject *str, PyObject *substr)
Guido van Rossum403d68b2000-03-13 15:55:09 +000011101{
Victor Stinner77282cb2013-04-14 19:22:47 +020011102 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011103 void *buf1, *buf2;
11104 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011105 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011106
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011107 if (!PyUnicode_Check(substr)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011108 PyErr_Format(PyExc_TypeError,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011109 "'in <string>' requires string as left operand, not %.100s",
11110 Py_TYPE(substr)->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011111 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011112 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011113 if (PyUnicode_READY(substr) == -1)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011114 return -1;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011115 if (ensure_unicode(str) < 0)
11116 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011117
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011118 kind1 = PyUnicode_KIND(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011119 kind2 = PyUnicode_KIND(substr);
11120 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011121 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011122 len1 = PyUnicode_GET_LENGTH(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011123 len2 = PyUnicode_GET_LENGTH(substr);
11124 if (len1 < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011125 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011126 buf1 = PyUnicode_DATA(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011127 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011128 if (len2 == 1) {
11129 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11130 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011131 return result;
11132 }
11133 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011134 buf2 = _PyUnicode_AsKind(substr, kind1);
11135 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011136 return -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011137 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011138
Victor Stinner77282cb2013-04-14 19:22:47 +020011139 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011140 case PyUnicode_1BYTE_KIND:
11141 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11142 break;
11143 case PyUnicode_2BYTE_KIND:
11144 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11145 break;
11146 case PyUnicode_4BYTE_KIND:
11147 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11148 break;
11149 default:
11150 result = -1;
11151 assert(0);
11152 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011153
Victor Stinner77282cb2013-04-14 19:22:47 +020011154 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011155 PyMem_Free(buf2);
11156
Guido van Rossum403d68b2000-03-13 15:55:09 +000011157 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011158}
11159
Guido van Rossumd57fd912000-03-10 22:53:23 +000011160/* Concat to string or Unicode object giving a new Unicode object. */
11161
Alexander Belopolsky40018472011-02-26 01:02:56 +000011162PyObject *
11163PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011164{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011165 PyObject *result;
Victor Stinner127226b2011-10-13 01:12:34 +020011166 Py_UCS4 maxchar, maxchar2;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011167 Py_ssize_t left_len, right_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011168
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011169 if (ensure_unicode(left) < 0 || ensure_unicode(right) < 0)
11170 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011171
11172 /* Shortcuts */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011173 if (left == unicode_empty)
11174 return PyUnicode_FromObject(right);
11175 if (right == unicode_empty)
11176 return PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011177
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011178 left_len = PyUnicode_GET_LENGTH(left);
11179 right_len = PyUnicode_GET_LENGTH(right);
11180 if (left_len > PY_SSIZE_T_MAX - right_len) {
Victor Stinner488fa492011-12-12 00:01:39 +010011181 PyErr_SetString(PyExc_OverflowError,
11182 "strings are too large to concat");
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011183 return NULL;
Victor Stinner488fa492011-12-12 00:01:39 +010011184 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011185 new_len = left_len + right_len;
Victor Stinner488fa492011-12-12 00:01:39 +010011186
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011187 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11188 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011189 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011190
Guido van Rossumd57fd912000-03-10 22:53:23 +000011191 /* Concat the two Unicode strings */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011192 result = PyUnicode_New(new_len, maxchar);
11193 if (result == NULL)
11194 return NULL;
11195 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11196 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11197 assert(_PyUnicode_CheckConsistency(result, 1));
11198 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011199}
11200
Walter Dörwald1ab83302007-05-18 17:15:44 +000011201void
Victor Stinner23e56682011-10-03 03:54:37 +020011202PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011203{
Victor Stinner23e56682011-10-03 03:54:37 +020011204 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011205 Py_UCS4 maxchar, maxchar2;
11206 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011207
11208 if (p_left == NULL) {
11209 if (!PyErr_Occurred())
11210 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011211 return;
11212 }
Victor Stinner23e56682011-10-03 03:54:37 +020011213 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011214 if (right == NULL || left == NULL
11215 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011216 if (!PyErr_Occurred())
11217 PyErr_BadInternalCall();
11218 goto error;
11219 }
11220
Benjamin Petersonbac79492012-01-14 13:34:47 -050011221 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011222 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011223 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011224 goto error;
11225
Victor Stinner488fa492011-12-12 00:01:39 +010011226 /* Shortcuts */
11227 if (left == unicode_empty) {
11228 Py_DECREF(left);
11229 Py_INCREF(right);
11230 *p_left = right;
11231 return;
11232 }
11233 if (right == unicode_empty)
11234 return;
11235
11236 left_len = PyUnicode_GET_LENGTH(left);
11237 right_len = PyUnicode_GET_LENGTH(right);
11238 if (left_len > PY_SSIZE_T_MAX - right_len) {
11239 PyErr_SetString(PyExc_OverflowError,
11240 "strings are too large to concat");
11241 goto error;
11242 }
11243 new_len = left_len + right_len;
11244
11245 if (unicode_modifiable(left)
11246 && PyUnicode_CheckExact(right)
11247 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011248 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11249 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011250 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011251 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011252 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11253 {
11254 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011255 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011256 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011257
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011258 /* copy 'right' into the newly allocated area of 'left' */
11259 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011260 }
Victor Stinner488fa492011-12-12 00:01:39 +010011261 else {
11262 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11263 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011264 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011265
Victor Stinner488fa492011-12-12 00:01:39 +010011266 /* Concat the two Unicode strings */
11267 res = PyUnicode_New(new_len, maxchar);
11268 if (res == NULL)
11269 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011270 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11271 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011272 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011273 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011274 }
11275 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011276 return;
11277
11278error:
Victor Stinner488fa492011-12-12 00:01:39 +010011279 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011280}
11281
11282void
11283PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11284{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011285 PyUnicode_Append(pleft, right);
11286 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011287}
11288
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011289/*
11290Wraps stringlib_parse_args_finds() and additionally ensures that the
11291first argument is a unicode object.
11292*/
11293
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070011294static inline int
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011295parse_args_finds_unicode(const char * function_name, PyObject *args,
11296 PyObject **substring,
11297 Py_ssize_t *start, Py_ssize_t *end)
11298{
11299 if(stringlib_parse_args_finds(function_name, args, substring,
11300 start, end)) {
11301 if (ensure_unicode(*substring) < 0)
11302 return 0;
11303 return 1;
11304 }
11305 return 0;
11306}
11307
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011308PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011309 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011310\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011311Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011312string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011313interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011314
11315static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011316unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011317{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011318 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011319 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011320 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011321 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011322 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011323 void *buf1, *buf2;
11324 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011325
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011326 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011327 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011328
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011329 kind1 = PyUnicode_KIND(self);
11330 kind2 = PyUnicode_KIND(substring);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011331 if (kind1 < kind2)
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011332 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011333
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011334 len1 = PyUnicode_GET_LENGTH(self);
11335 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011336 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011337 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011338 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011339
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011340 buf1 = PyUnicode_DATA(self);
11341 buf2 = PyUnicode_DATA(substring);
11342 if (kind2 != kind1) {
11343 buf2 = _PyUnicode_AsKind(substring, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011344 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011345 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011346 }
11347 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011348 case PyUnicode_1BYTE_KIND:
11349 iresult = ucs1lib_count(
11350 ((Py_UCS1*)buf1) + start, end - start,
11351 buf2, len2, PY_SSIZE_T_MAX
11352 );
11353 break;
11354 case PyUnicode_2BYTE_KIND:
11355 iresult = ucs2lib_count(
11356 ((Py_UCS2*)buf1) + start, end - start,
11357 buf2, len2, PY_SSIZE_T_MAX
11358 );
11359 break;
11360 case PyUnicode_4BYTE_KIND:
11361 iresult = ucs4lib_count(
11362 ((Py_UCS4*)buf1) + start, end - start,
11363 buf2, len2, PY_SSIZE_T_MAX
11364 );
11365 break;
11366 default:
11367 assert(0); iresult = 0;
11368 }
11369
11370 result = PyLong_FromSsize_t(iresult);
11371
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011372 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011373 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011374
Guido van Rossumd57fd912000-03-10 22:53:23 +000011375 return result;
11376}
11377
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011378PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000011379 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011380\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000011381Encode S using the codec registered for encoding. Default encoding\n\
11382is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000011383handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000011384a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
11385'xmlcharrefreplace' as well as any other name registered with\n\
11386codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011387
11388static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011389unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011390{
Benjamin Peterson308d6372009-09-18 21:42:35 +000011391 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011392 char *encoding = NULL;
11393 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000011394
Benjamin Peterson308d6372009-09-18 21:42:35 +000011395 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
11396 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011397 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011398 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011399}
11400
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011401PyDoc_STRVAR(expandtabs__doc__,
Ezio Melotti745d54d2013-11-16 19:10:57 +020011402 "S.expandtabs(tabsize=8) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011403\n\
11404Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011405If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011406
11407static PyObject*
Ezio Melotti745d54d2013-11-16 19:10:57 +020011408unicode_expandtabs(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011409{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011410 Py_ssize_t i, j, line_pos, src_len, incr;
11411 Py_UCS4 ch;
11412 PyObject *u;
11413 void *src_data, *dest_data;
Ezio Melotti745d54d2013-11-16 19:10:57 +020011414 static char *kwlist[] = {"tabsize", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011415 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011416 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011417 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011418
Ezio Melotti745d54d2013-11-16 19:10:57 +020011419 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:expandtabs",
11420 kwlist, &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000011421 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011422
Antoine Pitrou22425222011-10-04 19:10:51 +020011423 if (PyUnicode_READY(self) == -1)
11424 return NULL;
11425
Thomas Wouters7e474022000-07-16 12:04:32 +000011426 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011427 src_len = PyUnicode_GET_LENGTH(self);
11428 i = j = line_pos = 0;
11429 kind = PyUnicode_KIND(self);
11430 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011431 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011432 for (; i < src_len; i++) {
11433 ch = PyUnicode_READ(kind, src_data, i);
11434 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011435 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011436 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011437 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011438 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011439 goto overflow;
11440 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011441 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011442 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011443 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011444 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011445 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011446 goto overflow;
11447 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011448 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011449 if (ch == '\n' || ch == '\r')
11450 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011451 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011452 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011453 if (!found)
11454 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011455
Guido van Rossumd57fd912000-03-10 22:53:23 +000011456 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011457 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011458 if (!u)
11459 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011460 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011461
Antoine Pitroue71d5742011-10-04 15:55:09 +020011462 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011463
Antoine Pitroue71d5742011-10-04 15:55:09 +020011464 for (; i < src_len; i++) {
11465 ch = PyUnicode_READ(kind, src_data, i);
11466 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011467 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011468 incr = tabsize - (line_pos % tabsize);
11469 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010011470 FILL(kind, dest_data, ' ', j, incr);
11471 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011472 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011473 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011474 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011475 line_pos++;
11476 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011477 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011478 if (ch == '\n' || ch == '\r')
11479 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011480 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011481 }
11482 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011483 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011484
Antoine Pitroue71d5742011-10-04 15:55:09 +020011485 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011486 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11487 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011488}
11489
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011490PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011491 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011492\n\
11493Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011494such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011495arguments start and end are interpreted as in slice notation.\n\
11496\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011497Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011498
11499static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011500unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011501{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011502 /* initialize variables to prevent gcc warning */
11503 PyObject *substring = NULL;
11504 Py_ssize_t start = 0;
11505 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011506 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011507
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011508 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011509 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011510
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011511 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011512 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011513
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011514 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011515
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011516 if (result == -2)
11517 return NULL;
11518
Christian Heimes217cfd12007-12-02 14:31:20 +000011519 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011520}
11521
11522static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011523unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011524{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011525 void *data;
11526 enum PyUnicode_Kind kind;
11527 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011528
11529 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
11530 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011531 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011532 }
11533 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11534 PyErr_SetString(PyExc_IndexError, "string index out of range");
11535 return NULL;
11536 }
11537 kind = PyUnicode_KIND(self);
11538 data = PyUnicode_DATA(self);
11539 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011540 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011541}
11542
Guido van Rossumc2504932007-09-18 19:42:40 +000011543/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011544 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011545static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011546unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011547{
Guido van Rossumc2504932007-09-18 19:42:40 +000011548 Py_ssize_t len;
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011549 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011550
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011551#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011552 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011553#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011554 if (_PyUnicode_HASH(self) != -1)
11555 return _PyUnicode_HASH(self);
11556 if (PyUnicode_READY(self) == -1)
11557 return -1;
11558 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011559 /*
11560 We make the hash of the empty string be 0, rather than using
11561 (prefix ^ suffix), since this slightly obfuscates the hash secret
11562 */
11563 if (len == 0) {
11564 _PyUnicode_HASH(self) = 0;
11565 return 0;
11566 }
Christian Heimes985ecdc2013-11-20 11:46:18 +010011567 x = _Py_HashBytes(PyUnicode_DATA(self),
11568 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011569 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011570 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011571}
11572
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011573PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011574 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011575\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011576Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011577
11578static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011579unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011580{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011581 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011582 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011583 PyObject *substring = NULL;
11584 Py_ssize_t start = 0;
11585 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011586
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011587 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011588 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011589
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011590 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011591 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011592
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011593 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011594
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011595 if (result == -2)
11596 return NULL;
11597
Guido van Rossumd57fd912000-03-10 22:53:23 +000011598 if (result < 0) {
11599 PyErr_SetString(PyExc_ValueError, "substring not found");
11600 return NULL;
11601 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011602
Christian Heimes217cfd12007-12-02 14:31:20 +000011603 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011604}
11605
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011606PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011607 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011608\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011609Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011610at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011611
11612static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011613unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011614{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011615 Py_ssize_t i, length;
11616 int kind;
11617 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011618 int cased;
11619
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011620 if (PyUnicode_READY(self) == -1)
11621 return NULL;
11622 length = PyUnicode_GET_LENGTH(self);
11623 kind = PyUnicode_KIND(self);
11624 data = PyUnicode_DATA(self);
11625
Guido van Rossumd57fd912000-03-10 22:53:23 +000011626 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011627 if (length == 1)
11628 return PyBool_FromLong(
11629 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011630
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011631 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011632 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011633 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011634
Guido van Rossumd57fd912000-03-10 22:53:23 +000011635 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011636 for (i = 0; i < length; i++) {
11637 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011638
Benjamin Peterson29060642009-01-31 22:14:21 +000011639 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11640 return PyBool_FromLong(0);
11641 else if (!cased && Py_UNICODE_ISLOWER(ch))
11642 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011643 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011644 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011645}
11646
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011647PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011648 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011649\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011650Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011651at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011652
11653static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011654unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011655{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011656 Py_ssize_t i, length;
11657 int kind;
11658 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011659 int cased;
11660
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011661 if (PyUnicode_READY(self) == -1)
11662 return NULL;
11663 length = PyUnicode_GET_LENGTH(self);
11664 kind = PyUnicode_KIND(self);
11665 data = PyUnicode_DATA(self);
11666
Guido van Rossumd57fd912000-03-10 22:53:23 +000011667 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011668 if (length == 1)
11669 return PyBool_FromLong(
11670 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011671
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011672 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011673 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011674 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011675
Guido van Rossumd57fd912000-03-10 22:53:23 +000011676 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011677 for (i = 0; i < length; i++) {
11678 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011679
Benjamin Peterson29060642009-01-31 22:14:21 +000011680 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11681 return PyBool_FromLong(0);
11682 else if (!cased && Py_UNICODE_ISUPPER(ch))
11683 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011684 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011685 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011686}
11687
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011688PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011689 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011690\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011691Return True if S is a titlecased string and there is at least one\n\
11692character in S, i.e. upper- and titlecase characters may only\n\
11693follow uncased characters and lowercase characters only cased ones.\n\
11694Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011695
11696static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011697unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011698{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011699 Py_ssize_t i, length;
11700 int kind;
11701 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011702 int cased, previous_is_cased;
11703
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011704 if (PyUnicode_READY(self) == -1)
11705 return NULL;
11706 length = PyUnicode_GET_LENGTH(self);
11707 kind = PyUnicode_KIND(self);
11708 data = PyUnicode_DATA(self);
11709
Guido van Rossumd57fd912000-03-10 22:53:23 +000011710 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011711 if (length == 1) {
11712 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11713 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11714 (Py_UNICODE_ISUPPER(ch) != 0));
11715 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011716
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011717 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011718 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011719 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011720
Guido van Rossumd57fd912000-03-10 22:53:23 +000011721 cased = 0;
11722 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011723 for (i = 0; i < length; i++) {
11724 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011725
Benjamin Peterson29060642009-01-31 22:14:21 +000011726 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11727 if (previous_is_cased)
11728 return PyBool_FromLong(0);
11729 previous_is_cased = 1;
11730 cased = 1;
11731 }
11732 else if (Py_UNICODE_ISLOWER(ch)) {
11733 if (!previous_is_cased)
11734 return PyBool_FromLong(0);
11735 previous_is_cased = 1;
11736 cased = 1;
11737 }
11738 else
11739 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011740 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011741 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011742}
11743
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011744PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011745 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011746\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011747Return True if all characters in S are whitespace\n\
11748and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011749
11750static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011751unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011752{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011753 Py_ssize_t i, length;
11754 int kind;
11755 void *data;
11756
11757 if (PyUnicode_READY(self) == -1)
11758 return NULL;
11759 length = PyUnicode_GET_LENGTH(self);
11760 kind = PyUnicode_KIND(self);
11761 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011762
Guido van Rossumd57fd912000-03-10 22:53:23 +000011763 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011764 if (length == 1)
11765 return PyBool_FromLong(
11766 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011767
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011768 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011769 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011770 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011771
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011772 for (i = 0; i < length; i++) {
11773 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011774 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011775 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011776 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011777 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011778}
11779
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011780PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011781 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011782\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011783Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011784and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011785
11786static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011787unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011788{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011789 Py_ssize_t i, length;
11790 int kind;
11791 void *data;
11792
11793 if (PyUnicode_READY(self) == -1)
11794 return NULL;
11795 length = PyUnicode_GET_LENGTH(self);
11796 kind = PyUnicode_KIND(self);
11797 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011798
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011799 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011800 if (length == 1)
11801 return PyBool_FromLong(
11802 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011803
11804 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011805 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011806 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011807
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011808 for (i = 0; i < length; i++) {
11809 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011810 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011811 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011812 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011813}
11814
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011815PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011816 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011817\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011818Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011819and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011820
11821static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011822unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011823{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011824 int kind;
11825 void *data;
11826 Py_ssize_t len, i;
11827
11828 if (PyUnicode_READY(self) == -1)
11829 return NULL;
11830
11831 kind = PyUnicode_KIND(self);
11832 data = PyUnicode_DATA(self);
11833 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011834
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011835 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011836 if (len == 1) {
11837 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11838 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11839 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011840
11841 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011842 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011843 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011844
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011845 for (i = 0; i < len; i++) {
11846 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011847 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011848 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011849 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011850 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011851}
11852
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011853PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011854 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011855\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011856Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011857False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011858
11859static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011860unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011861{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011862 Py_ssize_t i, length;
11863 int kind;
11864 void *data;
11865
11866 if (PyUnicode_READY(self) == -1)
11867 return NULL;
11868 length = PyUnicode_GET_LENGTH(self);
11869 kind = PyUnicode_KIND(self);
11870 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011871
Guido van Rossumd57fd912000-03-10 22:53:23 +000011872 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011873 if (length == 1)
11874 return PyBool_FromLong(
11875 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011876
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011877 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011878 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011879 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011880
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011881 for (i = 0; i < length; i++) {
11882 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011883 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011884 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011885 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011886}
11887
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011888PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011889 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011890\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011891Return True if all characters in S are digits\n\
11892and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011893
11894static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011895unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011896{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011897 Py_ssize_t i, length;
11898 int kind;
11899 void *data;
11900
11901 if (PyUnicode_READY(self) == -1)
11902 return NULL;
11903 length = PyUnicode_GET_LENGTH(self);
11904 kind = PyUnicode_KIND(self);
11905 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011906
Guido van Rossumd57fd912000-03-10 22:53:23 +000011907 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011908 if (length == 1) {
11909 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11910 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11911 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011912
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011913 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011914 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011915 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011916
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011917 for (i = 0; i < length; i++) {
11918 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011919 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011920 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011921 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011922}
11923
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011924PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011925 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011926\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011927Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011928False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011929
11930static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011931unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011932{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011933 Py_ssize_t i, length;
11934 int kind;
11935 void *data;
11936
11937 if (PyUnicode_READY(self) == -1)
11938 return NULL;
11939 length = PyUnicode_GET_LENGTH(self);
11940 kind = PyUnicode_KIND(self);
11941 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011942
Guido van Rossumd57fd912000-03-10 22:53:23 +000011943 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011944 if (length == 1)
11945 return PyBool_FromLong(
11946 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011947
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011948 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011949 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011950 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011951
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011952 for (i = 0; i < length; i++) {
11953 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011954 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011955 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011956 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011957}
11958
Martin v. Löwis47383402007-08-15 07:32:56 +000011959int
11960PyUnicode_IsIdentifier(PyObject *self)
11961{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011962 int kind;
11963 void *data;
11964 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011965 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011966
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011967 if (PyUnicode_READY(self) == -1) {
11968 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011969 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011970 }
11971
11972 /* Special case for empty strings */
11973 if (PyUnicode_GET_LENGTH(self) == 0)
11974 return 0;
11975 kind = PyUnicode_KIND(self);
11976 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011977
11978 /* PEP 3131 says that the first character must be in
11979 XID_Start and subsequent characters in XID_Continue,
11980 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011981 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011982 letters, digits, underscore). However, given the current
11983 definition of XID_Start and XID_Continue, it is sufficient
11984 to check just for these, except that _ must be allowed
11985 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011986 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011987 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011988 return 0;
11989
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011990 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011991 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011992 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011993 return 1;
11994}
11995
11996PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011997 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011998\n\
11999Return True if S is a valid identifier according\n\
Raymond Hettinger378170d2013-03-23 08:21:12 -070012000to the language definition.\n\
12001\n\
12002Use keyword.iskeyword() to test for reserved identifiers\n\
12003such as \"def\" and \"class\".\n");
Martin v. Löwis47383402007-08-15 07:32:56 +000012004
12005static PyObject*
12006unicode_isidentifier(PyObject *self)
12007{
12008 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12009}
12010
Georg Brandl559e5d72008-06-11 18:37:52 +000012011PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012012 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000012013\n\
12014Return True if all characters in S are considered\n\
12015printable in repr() or S is empty, False otherwise.");
12016
12017static PyObject*
12018unicode_isprintable(PyObject *self)
12019{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012020 Py_ssize_t i, length;
12021 int kind;
12022 void *data;
12023
12024 if (PyUnicode_READY(self) == -1)
12025 return NULL;
12026 length = PyUnicode_GET_LENGTH(self);
12027 kind = PyUnicode_KIND(self);
12028 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000012029
12030 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012031 if (length == 1)
12032 return PyBool_FromLong(
12033 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000012034
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012035 for (i = 0; i < length; i++) {
12036 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012037 Py_RETURN_FALSE;
12038 }
12039 }
12040 Py_RETURN_TRUE;
12041}
12042
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012043PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000012044 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012045\n\
12046Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000012047iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012048
12049static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012050unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012051{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012052 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012053}
12054
Martin v. Löwis18e16552006-02-15 17:27:45 +000012055static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012056unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012057{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012058 if (PyUnicode_READY(self) == -1)
12059 return -1;
12060 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012061}
12062
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012063PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012064 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012065\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012066Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012067done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012068
12069static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012070unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012071{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012072 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012073 Py_UCS4 fillchar = ' ';
12074
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012075 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012076 return NULL;
12077
Benjamin Petersonbac79492012-01-14 13:34:47 -050012078 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012079 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012080
Victor Stinnerc4b49542011-12-11 22:44:26 +010012081 if (PyUnicode_GET_LENGTH(self) >= width)
12082 return unicode_result_unchanged(self);
12083
12084 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012085}
12086
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012087PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012088 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012089\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012090Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012091
12092static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012093unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012094{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012095 if (PyUnicode_READY(self) == -1)
12096 return NULL;
12097 if (PyUnicode_IS_ASCII(self))
12098 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012099 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012100}
12101
/* Strip modes.  The values double as indexes into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument format strings, indexed by strip mode. */
static const char * const stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip",     /* BOTHSTRIP */
};

/* Method name for strip mode i: the format string with its leading
   three characters ("|O:") skipped. */
#define STRIPNAME(i) (stripformat[(i)] + 3)
12110
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012111/* externally visible for str.strip(unicode) */
12112PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012113_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012114{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012115 void *data;
12116 int kind;
12117 Py_ssize_t i, j, len;
12118 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012119 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012120
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012121 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12122 return NULL;
12123
12124 kind = PyUnicode_KIND(self);
12125 data = PyUnicode_DATA(self);
12126 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020012127 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012128 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12129 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020012130 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012131
Benjamin Peterson14339b62009-01-31 16:36:08 +000012132 i = 0;
12133 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012134 while (i < len) {
12135 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12136 if (!BLOOM(sepmask, ch))
12137 break;
12138 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12139 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012140 i++;
12141 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012142 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012143
Benjamin Peterson14339b62009-01-31 16:36:08 +000012144 j = len;
12145 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012146 j--;
12147 while (j >= i) {
12148 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12149 if (!BLOOM(sepmask, ch))
12150 break;
12151 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12152 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012153 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012154 }
12155
Benjamin Peterson29060642009-01-31 22:14:21 +000012156 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012157 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012158
Victor Stinner7931d9a2011-11-04 00:22:48 +010012159 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012160}
12161
12162PyObject*
12163PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12164{
12165 unsigned char *data;
12166 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012167 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012168
Victor Stinnerde636f32011-10-01 03:55:54 +020012169 if (PyUnicode_READY(self) == -1)
12170 return NULL;
12171
Victor Stinner684d5fd2012-05-03 02:32:34 +020012172 length = PyUnicode_GET_LENGTH(self);
12173 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012174
Victor Stinner684d5fd2012-05-03 02:32:34 +020012175 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012176 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012177
Victor Stinnerde636f32011-10-01 03:55:54 +020012178 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012179 PyErr_SetString(PyExc_IndexError, "string index out of range");
12180 return NULL;
12181 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012182 if (start >= length || end < start)
12183 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012184
Victor Stinner684d5fd2012-05-03 02:32:34 +020012185 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012186 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012187 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012188 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012189 }
12190 else {
12191 kind = PyUnicode_KIND(self);
12192 data = PyUnicode_1BYTE_DATA(self);
12193 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012194 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012195 length);
12196 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012197}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012198
12199static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012200do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012201{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012202 Py_ssize_t len, i, j;
12203
12204 if (PyUnicode_READY(self) == -1)
12205 return NULL;
12206
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012207 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012208
Victor Stinnercc7af722013-04-09 22:39:24 +020012209 if (PyUnicode_IS_ASCII(self)) {
12210 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12211
12212 i = 0;
12213 if (striptype != RIGHTSTRIP) {
12214 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012215 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012216 if (!_Py_ascii_whitespace[ch])
12217 break;
12218 i++;
12219 }
12220 }
12221
12222 j = len;
12223 if (striptype != LEFTSTRIP) {
12224 j--;
12225 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012226 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012227 if (!_Py_ascii_whitespace[ch])
12228 break;
12229 j--;
12230 }
12231 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012232 }
12233 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012234 else {
12235 int kind = PyUnicode_KIND(self);
12236 void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012237
Victor Stinnercc7af722013-04-09 22:39:24 +020012238 i = 0;
12239 if (striptype != RIGHTSTRIP) {
12240 while (i < len) {
12241 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12242 if (!Py_UNICODE_ISSPACE(ch))
12243 break;
12244 i++;
12245 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012246 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012247
12248 j = len;
12249 if (striptype != LEFTSTRIP) {
12250 j--;
12251 while (j >= i) {
12252 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12253 if (!Py_UNICODE_ISSPACE(ch))
12254 break;
12255 j--;
12256 }
12257 j++;
12258 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012259 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012260
Victor Stinner7931d9a2011-11-04 00:22:48 +010012261 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012262}
12263
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012264
12265static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012266do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012267{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012268 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012269
Serhiy Storchakac6792272013-10-19 21:03:34 +030012270 if (!PyArg_ParseTuple(args, stripformat[striptype], &sep))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012271 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012272
Benjamin Peterson14339b62009-01-31 16:36:08 +000012273 if (sep != NULL && sep != Py_None) {
12274 if (PyUnicode_Check(sep))
12275 return _PyUnicode_XStrip(self, striptype, sep);
12276 else {
12277 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012278 "%s arg must be None or str",
12279 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012280 return NULL;
12281 }
12282 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012283
Benjamin Peterson14339b62009-01-31 16:36:08 +000012284 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012285}
12286
12287
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012288PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012289 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012290\n\
12291Return a copy of the string S with leading and trailing\n\
12292whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012293If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012294
12295static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012296unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012297{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012298 if (PyTuple_GET_SIZE(args) == 0)
12299 return do_strip(self, BOTHSTRIP); /* Common case */
12300 else
12301 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012302}
12303
12304
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012305PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012306 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012307\n\
12308Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012309If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012310
12311static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012312unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012313{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012314 if (PyTuple_GET_SIZE(args) == 0)
12315 return do_strip(self, LEFTSTRIP); /* Common case */
12316 else
12317 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012318}
12319
12320
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012321PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012322 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012323\n\
12324Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012325If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012326
12327static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012328unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012329{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012330 if (PyTuple_GET_SIZE(args) == 0)
12331 return do_strip(self, RIGHTSTRIP); /* Common case */
12332 else
12333 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012334}
12335
12336
Guido van Rossumd57fd912000-03-10 22:53:23 +000012337static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012338unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012339{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012340 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012341 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012342
Serhiy Storchaka05997252013-01-26 12:14:02 +020012343 if (len < 1)
12344 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012345
Victor Stinnerc4b49542011-12-11 22:44:26 +010012346 /* no repeat, return original string */
12347 if (len == 1)
12348 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012349
Benjamin Petersonbac79492012-01-14 13:34:47 -050012350 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012351 return NULL;
12352
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012353 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012354 PyErr_SetString(PyExc_OverflowError,
12355 "repeated string is too long");
12356 return NULL;
12357 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012358 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012359
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012360 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012361 if (!u)
12362 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012363 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012364
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012365 if (PyUnicode_GET_LENGTH(str) == 1) {
12366 const int kind = PyUnicode_KIND(str);
12367 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012368 if (kind == PyUnicode_1BYTE_KIND) {
12369 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012370 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012371 }
12372 else if (kind == PyUnicode_2BYTE_KIND) {
12373 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012374 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012375 ucs2[n] = fill_char;
12376 } else {
12377 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12378 assert(kind == PyUnicode_4BYTE_KIND);
12379 for (n = 0; n < len; ++n)
12380 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012381 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012382 }
12383 else {
12384 /* number of characters copied this far */
12385 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012386 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012387 char *to = (char *) PyUnicode_DATA(u);
12388 Py_MEMCPY(to, PyUnicode_DATA(str),
12389 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012390 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012391 n = (done <= nchars-done) ? done : nchars-done;
12392 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012393 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012394 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012395 }
12396
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012397 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012398 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012399}
12400
Alexander Belopolsky40018472011-02-26 01:02:56 +000012401PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012402PyUnicode_Replace(PyObject *str,
12403 PyObject *substr,
12404 PyObject *replstr,
Alexander Belopolsky40018472011-02-26 01:02:56 +000012405 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012406{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012407 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12408 ensure_unicode(replstr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012409 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012410 return replace(str, substr, replstr, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012411}
12412
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012413PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000012414 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012415\n\
12416Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000012417old replaced by new. If the optional argument count is\n\
12418given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012419
12420static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012421unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012422{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012423 PyObject *str1;
12424 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012425 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012426
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012427 if (!PyArg_ParseTuple(args, "UU|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012428 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060012429 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012430 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012431 return replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012432}
12433
Alexander Belopolsky40018472011-02-26 01:02:56 +000012434static PyObject *
12435unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012436{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012437 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012438 Py_ssize_t isize;
12439 Py_ssize_t osize, squote, dquote, i, o;
12440 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012441 int ikind, okind, unchanged;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012442 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012443
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012444 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012445 return NULL;
12446
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012447 isize = PyUnicode_GET_LENGTH(unicode);
12448 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012449
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012450 /* Compute length of output, quote characters, and
12451 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012452 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012453 max = 127;
12454 squote = dquote = 0;
12455 ikind = PyUnicode_KIND(unicode);
12456 for (i = 0; i < isize; i++) {
12457 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040012458 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012459 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040012460 case '\'': squote++; break;
12461 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012462 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040012463 incr = 2;
12464 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012465 default:
12466 /* Fast-path ASCII */
12467 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012468 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012469 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012470 ;
12471 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012472 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012473 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012474 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012475 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012476 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012477 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040012478 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012479 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040012480 if (osize > PY_SSIZE_T_MAX - incr) {
12481 PyErr_SetString(PyExc_OverflowError,
12482 "string is too long to generate repr");
12483 return NULL;
12484 }
12485 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012486 }
12487
12488 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012489 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012490 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012491 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012492 if (dquote)
12493 /* Both squote and dquote present. Use squote,
12494 and escape them */
12495 osize += squote;
12496 else
12497 quote = '"';
12498 }
Victor Stinner55c08782013-04-14 18:45:39 +020012499 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012500
12501 repr = PyUnicode_New(osize, max);
12502 if (repr == NULL)
12503 return NULL;
12504 okind = PyUnicode_KIND(repr);
12505 odata = PyUnicode_DATA(repr);
12506
12507 PyUnicode_WRITE(okind, odata, 0, quote);
12508 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012509 if (unchanged) {
12510 _PyUnicode_FastCopyCharacters(repr, 1,
12511 unicode, 0,
12512 isize);
12513 }
12514 else {
12515 for (i = 0, o = 1; i < isize; i++) {
12516 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012517
Victor Stinner55c08782013-04-14 18:45:39 +020012518 /* Escape quotes and backslashes */
12519 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012520 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012521 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012522 continue;
12523 }
12524
12525 /* Map special whitespace to '\t', \n', '\r' */
12526 if (ch == '\t') {
12527 PyUnicode_WRITE(okind, odata, o++, '\\');
12528 PyUnicode_WRITE(okind, odata, o++, 't');
12529 }
12530 else if (ch == '\n') {
12531 PyUnicode_WRITE(okind, odata, o++, '\\');
12532 PyUnicode_WRITE(okind, odata, o++, 'n');
12533 }
12534 else if (ch == '\r') {
12535 PyUnicode_WRITE(okind, odata, o++, '\\');
12536 PyUnicode_WRITE(okind, odata, o++, 'r');
12537 }
12538
12539 /* Map non-printable US ASCII to '\xhh' */
12540 else if (ch < ' ' || ch == 0x7F) {
12541 PyUnicode_WRITE(okind, odata, o++, '\\');
12542 PyUnicode_WRITE(okind, odata, o++, 'x');
12543 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12544 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12545 }
12546
12547 /* Copy ASCII characters as-is */
12548 else if (ch < 0x7F) {
12549 PyUnicode_WRITE(okind, odata, o++, ch);
12550 }
12551
12552 /* Non-ASCII characters */
12553 else {
12554 /* Map Unicode whitespace and control characters
12555 (categories Z* and C* except ASCII space)
12556 */
12557 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12558 PyUnicode_WRITE(okind, odata, o++, '\\');
12559 /* Map 8-bit characters to '\xhh' */
12560 if (ch <= 0xff) {
12561 PyUnicode_WRITE(okind, odata, o++, 'x');
12562 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12563 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12564 }
12565 /* Map 16-bit characters to '\uxxxx' */
12566 else if (ch <= 0xffff) {
12567 PyUnicode_WRITE(okind, odata, o++, 'u');
12568 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12569 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12570 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12571 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12572 }
12573 /* Map 21-bit characters to '\U00xxxxxx' */
12574 else {
12575 PyUnicode_WRITE(okind, odata, o++, 'U');
12576 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12577 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12578 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12579 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12580 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12581 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12582 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12583 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12584 }
12585 }
12586 /* Copy characters as-is */
12587 else {
12588 PyUnicode_WRITE(okind, odata, o++, ch);
12589 }
Georg Brandl559e5d72008-06-11 18:37:52 +000012590 }
12591 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012592 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012593 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012594 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012595 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012596}
12597
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012598PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012599 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012600\n\
12601Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012602such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012603arguments start and end are interpreted as in slice notation.\n\
12604\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012605Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012606
12607static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012608unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012609{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012610 /* initialize variables to prevent gcc warning */
12611 PyObject *substring = NULL;
12612 Py_ssize_t start = 0;
12613 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012614 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012615
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012616 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012617 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012618
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012619 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012620 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012621
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012622 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012623
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012624 if (result == -2)
12625 return NULL;
12626
Christian Heimes217cfd12007-12-02 14:31:20 +000012627 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012628}
12629
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012630PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012631 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012632\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012633Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012634
12635static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012636unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012637{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012638 /* initialize variables to prevent gcc warning */
12639 PyObject *substring = NULL;
12640 Py_ssize_t start = 0;
12641 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012642 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012643
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012644 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012645 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012646
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012647 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012648 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012649
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012650 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012651
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012652 if (result == -2)
12653 return NULL;
12654
Guido van Rossumd57fd912000-03-10 22:53:23 +000012655 if (result < 0) {
12656 PyErr_SetString(PyExc_ValueError, "substring not found");
12657 return NULL;
12658 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012659
Christian Heimes217cfd12007-12-02 14:31:20 +000012660 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012661}
12662
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012663PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012664 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012665\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012666Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012667done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012668
12669static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012670unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012671{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012672 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012673 Py_UCS4 fillchar = ' ';
12674
Victor Stinnere9a29352011-10-01 02:14:59 +020012675 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012676 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012677
Benjamin Petersonbac79492012-01-14 13:34:47 -050012678 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012679 return NULL;
12680
Victor Stinnerc4b49542011-12-11 22:44:26 +010012681 if (PyUnicode_GET_LENGTH(self) >= width)
12682 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012683
Victor Stinnerc4b49542011-12-11 22:44:26 +010012684 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012685}
12686
Alexander Belopolsky40018472011-02-26 01:02:56 +000012687PyObject *
12688PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012689{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012690 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012691 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012692
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012693 return split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012694}
12695
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012696PyDoc_STRVAR(split__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012697 "S.split(sep=None, maxsplit=-1) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012698\n\
12699Return a list of the words in S, using sep as the\n\
12700delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012701splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012702whitespace string is a separator and empty strings are\n\
12703removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012704
12705static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012706unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012707{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012708 static char *kwlist[] = {"sep", "maxsplit", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000012709 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012710 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012711
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012712 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12713 kwlist, &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012714 return NULL;
12715
12716 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012717 return split(self, NULL, maxcount);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012718
12719 if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012720 return split(self, substring, maxcount);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012721
12722 PyErr_Format(PyExc_TypeError,
12723 "must be str or None, not %.100s",
12724 Py_TYPE(substring)->tp_name);
12725 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012726}
12727
Thomas Wouters477c8d52006-05-27 19:21:47 +000012728PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012729PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012730{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012731 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012732 int kind1, kind2;
12733 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012734 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012735
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012736 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012737 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012738
Victor Stinner14f8f022011-10-05 20:58:25 +020012739 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012740 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012741 len1 = PyUnicode_GET_LENGTH(str_obj);
12742 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012743 if (kind1 < kind2 || len1 < len2) {
12744 _Py_INCREF_UNICODE_EMPTY();
12745 if (!unicode_empty)
12746 out = NULL;
12747 else {
12748 out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
12749 Py_DECREF(unicode_empty);
12750 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012751 return out;
12752 }
12753 buf1 = PyUnicode_DATA(str_obj);
12754 buf2 = PyUnicode_DATA(sep_obj);
12755 if (kind2 != kind1) {
12756 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12757 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012758 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012759 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012760
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012761 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012762 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012763 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12764 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12765 else
12766 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012767 break;
12768 case PyUnicode_2BYTE_KIND:
12769 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12770 break;
12771 case PyUnicode_4BYTE_KIND:
12772 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12773 break;
12774 default:
12775 assert(0);
12776 out = 0;
12777 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012778
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012779 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012780 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012781
12782 return out;
12783}
12784
12785
12786PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012787PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012788{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012789 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012790 int kind1, kind2;
12791 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012792 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012793
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012794 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012795 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012796
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012797 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012798 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012799 len1 = PyUnicode_GET_LENGTH(str_obj);
12800 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012801 if (kind1 < kind2 || len1 < len2) {
12802 _Py_INCREF_UNICODE_EMPTY();
12803 if (!unicode_empty)
12804 out = NULL;
12805 else {
12806 out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
12807 Py_DECREF(unicode_empty);
12808 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012809 return out;
12810 }
12811 buf1 = PyUnicode_DATA(str_obj);
12812 buf2 = PyUnicode_DATA(sep_obj);
12813 if (kind2 != kind1) {
12814 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12815 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012816 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012817 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012818
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012819 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012820 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012821 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12822 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12823 else
12824 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012825 break;
12826 case PyUnicode_2BYTE_KIND:
12827 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12828 break;
12829 case PyUnicode_4BYTE_KIND:
12830 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12831 break;
12832 default:
12833 assert(0);
12834 out = 0;
12835 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012836
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012837 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012838 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012839
12840 return out;
12841}
12842
12843PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012844 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012845\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012846Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012847the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012848found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012849
12850static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012851unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012852{
Victor Stinner9310abb2011-10-05 00:59:23 +020012853 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012854}
12855
12856PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012857 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012858\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012859Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012860the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012861separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012862
12863static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012864unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012865{
Victor Stinner9310abb2011-10-05 00:59:23 +020012866 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012867}
12868
Alexander Belopolsky40018472011-02-26 01:02:56 +000012869PyObject *
12870PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012871{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012872 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012873 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012874
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012875 return rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012876}
12877
12878PyDoc_STRVAR(rsplit__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012879 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012880\n\
12881Return a list of the words in S, using sep as the\n\
12882delimiter string, starting at the end of the string and\n\
12883working to the front. If maxsplit is given, at most maxsplit\n\
12884splits are done. If sep is not specified, any whitespace string\n\
12885is a separator.");
12886
12887static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012888unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012889{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012890 static char *kwlist[] = {"sep", "maxsplit", 0};
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012891 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012892 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012893
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012894 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
12895 kwlist, &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012896 return NULL;
12897
12898 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012899 return rsplit(self, NULL, maxcount);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012900
12901 if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012902 return rsplit(self, substring, maxcount);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012903
12904 PyErr_Format(PyExc_TypeError,
12905 "must be str or None, not %.100s",
12906 Py_TYPE(substring)->tp_name);
12907 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012908}
12909
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012910PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012911 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012912\n\
12913Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012914Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012915is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012916
12917static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012918unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012919{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012920 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012921 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012922
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012923 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12924 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012925 return NULL;
12926
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012927 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012928}
12929
12930static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012931PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012932{
Victor Stinnerc4b49542011-12-11 22:44:26 +010012933 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012934}
12935
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012936PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012937 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012938\n\
12939Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012940and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012941
12942static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012943unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012944{
Benjamin Petersoneea48462012-01-16 14:28:50 -050012945 if (PyUnicode_READY(self) == -1)
12946 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012947 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012948}
12949
Larry Hastings61272b72014-01-07 12:41:53 -080012950/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000012951
Larry Hastings31826802013-10-19 00:09:25 -070012952@staticmethod
12953str.maketrans as unicode_maketrans
12954
12955 x: object
12956
12957 y: unicode=NULL
12958
12959 z: unicode=NULL
12960
12961 /
12962
12963Return a translation table usable for str.translate().
12964
12965If there is only one argument, it must be a dictionary mapping Unicode
12966ordinals (integers) or characters to Unicode ordinals, strings or None.
12967Character keys will be then converted to ordinals.
12968If there are two arguments, they must be strings of equal length, and
12969in the resulting dictionary, each character in x will be mapped to the
12970character at the same position in y. If there is a third argument, it
12971must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080012972[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070012973
Larry Hastings31826802013-10-19 00:09:25 -070012974static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080012975unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030012976/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070012977{
Georg Brandlceee0772007-11-27 23:48:05 +000012978 PyObject *new = NULL, *key, *value;
12979 Py_ssize_t i = 0;
12980 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012981
Georg Brandlceee0772007-11-27 23:48:05 +000012982 new = PyDict_New();
12983 if (!new)
12984 return NULL;
12985 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012986 int x_kind, y_kind, z_kind;
12987 void *x_data, *y_data, *z_data;
12988
Georg Brandlceee0772007-11-27 23:48:05 +000012989 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012990 if (!PyUnicode_Check(x)) {
12991 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12992 "be a string if there is a second argument");
12993 goto err;
12994 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012995 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012996 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12997 "arguments must have equal length");
12998 goto err;
12999 }
13000 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013001 x_kind = PyUnicode_KIND(x);
13002 y_kind = PyUnicode_KIND(y);
13003 x_data = PyUnicode_DATA(x);
13004 y_data = PyUnicode_DATA(y);
13005 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13006 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013007 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000013008 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060013009 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013010 if (!value) {
13011 Py_DECREF(key);
13012 goto err;
13013 }
Georg Brandlceee0772007-11-27 23:48:05 +000013014 res = PyDict_SetItem(new, key, value);
13015 Py_DECREF(key);
13016 Py_DECREF(value);
13017 if (res < 0)
13018 goto err;
13019 }
13020 /* create entries for deleting chars in z */
13021 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013022 z_kind = PyUnicode_KIND(z);
13023 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013024 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013025 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013026 if (!key)
13027 goto err;
13028 res = PyDict_SetItem(new, key, Py_None);
13029 Py_DECREF(key);
13030 if (res < 0)
13031 goto err;
13032 }
13033 }
13034 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013035 int kind;
13036 void *data;
13037
Georg Brandlceee0772007-11-27 23:48:05 +000013038 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013039 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013040 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13041 "to maketrans it must be a dict");
13042 goto err;
13043 }
13044 /* copy entries into the new dict, converting string keys to int keys */
13045 while (PyDict_Next(x, &i, &key, &value)) {
13046 if (PyUnicode_Check(key)) {
13047 /* convert string keys to integer keys */
13048 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013049 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013050 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13051 "table must be of length 1");
13052 goto err;
13053 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013054 kind = PyUnicode_KIND(key);
13055 data = PyUnicode_DATA(key);
13056 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013057 if (!newkey)
13058 goto err;
13059 res = PyDict_SetItem(new, newkey, value);
13060 Py_DECREF(newkey);
13061 if (res < 0)
13062 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013063 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013064 /* just keep integer keys */
13065 if (PyDict_SetItem(new, key, value) < 0)
13066 goto err;
13067 } else {
13068 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13069 "be strings or integers");
13070 goto err;
13071 }
13072 }
13073 }
13074 return new;
13075 err:
13076 Py_DECREF(new);
13077 return NULL;
13078}
13079
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013080PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013081 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013082\n\
Zachary Ware79b98df2015-08-05 23:54:15 -050013083Return a copy of the string S in which each character has been mapped\n\
13084through the given translation table. The table must implement\n\
13085lookup/indexing via __getitem__, for instance a dictionary or list,\n\
13086mapping Unicode ordinals to Unicode ordinals, strings, or None. If\n\
13087this operation raises LookupError, the character is left untouched.\n\
13088Characters mapped to None are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013089
13090static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013091unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013092{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013093 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013094}
13095
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013096PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013097 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013098\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013099Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013100
13101static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020013102unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013103{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013104 if (PyUnicode_READY(self) == -1)
13105 return NULL;
13106 if (PyUnicode_IS_ASCII(self))
13107 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013108 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013109}
13110
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013111PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013112 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013113\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000013114Pad a numeric string S with zeros on the left, to fill a field\n\
13115of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013116
13117static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020013118unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013119{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013120 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013121 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013122 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013123 int kind;
13124 void *data;
13125 Py_UCS4 chr;
13126
Martin v. Löwis18e16552006-02-15 17:27:45 +000013127 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000013128 return NULL;
13129
Benjamin Petersonbac79492012-01-14 13:34:47 -050013130 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013131 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013132
Victor Stinnerc4b49542011-12-11 22:44:26 +010013133 if (PyUnicode_GET_LENGTH(self) >= width)
13134 return unicode_result_unchanged(self);
13135
13136 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013137
13138 u = pad(self, fill, 0, '0');
13139
Walter Dörwald068325e2002-04-15 13:36:47 +000013140 if (u == NULL)
13141 return NULL;
13142
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013143 kind = PyUnicode_KIND(u);
13144 data = PyUnicode_DATA(u);
13145 chr = PyUnicode_READ(kind, data, fill);
13146
13147 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013148 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013149 PyUnicode_WRITE(kind, data, 0, chr);
13150 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013151 }
13152
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013153 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013154 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013155}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013156
#if 0
/* Compiled out.  Thin wrapper that returns the result of
   PyUnicode_TransformDecimalAndSpaceToASCII(self). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted;

    converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);
    return converted;
}
#endif
13164
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013165PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013166 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013167\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013168Return True if S starts with the specified prefix, False otherwise.\n\
13169With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013170With optional end, stop comparing S at that position.\n\
13171prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013172
13173static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013174unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013175 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013176{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013177 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013178 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013179 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013180 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013181 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013182
Jesus Ceaac451502011-04-20 17:09:23 +020013183 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013184 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013185 if (PyTuple_Check(subobj)) {
13186 Py_ssize_t i;
13187 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013188 substring = PyTuple_GET_ITEM(subobj, i);
13189 if (!PyUnicode_Check(substring)) {
13190 PyErr_Format(PyExc_TypeError,
13191 "tuple for startswith must only contain str, "
13192 "not %.100s",
13193 Py_TYPE(substring)->tp_name);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013194 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013195 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013196 result = tailmatch(self, substring, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013197 if (result == -1)
13198 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013199 if (result) {
13200 Py_RETURN_TRUE;
13201 }
13202 }
13203 /* nothing matched */
13204 Py_RETURN_FALSE;
13205 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013206 if (!PyUnicode_Check(subobj)) {
13207 PyErr_Format(PyExc_TypeError,
13208 "startswith first arg must be str or "
13209 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013210 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013211 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013212 result = tailmatch(self, subobj, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013213 if (result == -1)
13214 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013215 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013216}
13217
13218
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013219PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013220 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013221\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013222Return True if S ends with the specified suffix, False otherwise.\n\
13223With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013224With optional end, stop comparing S at that position.\n\
13225suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013226
13227static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013228unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013229 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013230{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013231 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013232 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013233 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013234 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013235 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013236
Jesus Ceaac451502011-04-20 17:09:23 +020013237 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013238 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013239 if (PyTuple_Check(subobj)) {
13240 Py_ssize_t i;
13241 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013242 substring = PyTuple_GET_ITEM(subobj, i);
13243 if (!PyUnicode_Check(substring)) {
13244 PyErr_Format(PyExc_TypeError,
13245 "tuple for endswith must only contain str, "
13246 "not %.100s",
13247 Py_TYPE(substring)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013248 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013249 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013250 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013251 if (result == -1)
13252 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013253 if (result) {
13254 Py_RETURN_TRUE;
13255 }
13256 }
13257 Py_RETURN_FALSE;
13258 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013259 if (!PyUnicode_Check(subobj)) {
13260 PyErr_Format(PyExc_TypeError,
13261 "endswith first arg must be str or "
13262 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013263 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013264 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013265 result = tailmatch(self, subobj, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013266 if (result == -1)
13267 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013268 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013269}
13270
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013271static inline void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013272_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013273{
Victor Stinnereb36fda2015-10-03 01:55:51 +020013274 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13275 writer->data = PyUnicode_DATA(writer->buffer);
13276
13277 if (!writer->readonly) {
13278 writer->kind = PyUnicode_KIND(writer->buffer);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013279 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinnereb36fda2015-10-03 01:55:51 +020013280 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013281 else {
Victor Stinnereb36fda2015-10-03 01:55:51 +020013282 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13283 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13284 writer->kind = PyUnicode_WCHAR_KIND;
13285 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13286
Victor Stinner8f674cc2013-04-17 23:02:17 +020013287 /* Copy-on-write mode: set buffer size to 0 so
13288 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13289 * next write. */
13290 writer->size = 0;
13291 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013292}
13293
Victor Stinnerd3f08822012-05-29 12:57:52 +020013294void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013295_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013296{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013297 memset(writer, 0, sizeof(*writer));
Victor Stinnereb36fda2015-10-03 01:55:51 +020013298
13299 /* ASCII is the bare minimum */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013300 writer->min_char = 127;
Victor Stinnereb36fda2015-10-03 01:55:51 +020013301
13302 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13303 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13304 writer->kind = PyUnicode_WCHAR_KIND;
13305 assert(writer->kind <= PyUnicode_1BYTE_KIND);
Victor Stinner202fdca2012-05-07 12:47:02 +020013306}
13307
Victor Stinnerd3f08822012-05-29 12:57:52 +020013308int
13309_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13310 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013311{
13312 Py_ssize_t newlen;
13313 PyObject *newbuffer;
13314
Victor Stinner2740e462016-09-06 16:58:36 -070013315 assert(maxchar <= MAX_UNICODE);
13316
Victor Stinnerca9381e2015-09-22 00:58:32 +020013317 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
Victor Stinner61744742015-09-22 01:01:17 +020013318 assert((maxchar > writer->maxchar && length >= 0)
13319 || length > 0);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013320
Victor Stinner202fdca2012-05-07 12:47:02 +020013321 if (length > PY_SSIZE_T_MAX - writer->pos) {
13322 PyErr_NoMemory();
13323 return -1;
13324 }
13325 newlen = writer->pos + length;
13326
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013327 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013328
Victor Stinnerd3f08822012-05-29 12:57:52 +020013329 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013330 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013331 if (writer->overallocate
13332 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13333 /* overallocate to limit the number of realloc() */
13334 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013335 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013336 if (newlen < writer->min_length)
13337 newlen = writer->min_length;
13338
Victor Stinnerd3f08822012-05-29 12:57:52 +020013339 writer->buffer = PyUnicode_New(newlen, maxchar);
13340 if (writer->buffer == NULL)
13341 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013342 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013343 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013344 if (writer->overallocate
13345 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13346 /* overallocate to limit the number of realloc() */
13347 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013348 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013349 if (newlen < writer->min_length)
13350 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013351
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013352 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013353 /* resize + widen */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +030013354 maxchar = Py_MAX(maxchar, writer->maxchar);
Victor Stinner202fdca2012-05-07 12:47:02 +020013355 newbuffer = PyUnicode_New(newlen, maxchar);
13356 if (newbuffer == NULL)
13357 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013358 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13359 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013360 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013361 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013362 }
13363 else {
13364 newbuffer = resize_compact(writer->buffer, newlen);
13365 if (newbuffer == NULL)
13366 return -1;
13367 }
13368 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013369 }
13370 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013371 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013372 newbuffer = PyUnicode_New(writer->size, maxchar);
13373 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013374 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013375 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13376 writer->buffer, 0, writer->pos);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030013377 Py_SETREF(writer->buffer, newbuffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013378 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013379 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013380 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010013381
13382#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020013383}
13384
Victor Stinnerca9381e2015-09-22 00:58:32 +020013385int
13386_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13387 enum PyUnicode_Kind kind)
13388{
13389 Py_UCS4 maxchar;
13390
13391 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13392 assert(writer->kind < kind);
13393
13394 switch (kind)
13395 {
13396 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13397 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13398 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
13399 default:
13400 assert(0 && "invalid kind");
13401 return -1;
13402 }
13403
13404 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13405}
13406
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013407static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013408_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013409{
Victor Stinner2740e462016-09-06 16:58:36 -070013410 assert(ch <= MAX_UNICODE);
Victor Stinnera0dd0212013-04-11 22:09:04 +020013411 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13412 return -1;
13413 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13414 writer->pos++;
13415 return 0;
13416}
13417
13418int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013419_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13420{
13421 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13422}
13423
13424int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013425_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13426{
13427 Py_UCS4 maxchar;
13428 Py_ssize_t len;
13429
13430 if (PyUnicode_READY(str) == -1)
13431 return -1;
13432 len = PyUnicode_GET_LENGTH(str);
13433 if (len == 0)
13434 return 0;
13435 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13436 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013437 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010013438 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020013439 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013440 Py_INCREF(str);
13441 writer->buffer = str;
13442 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013443 writer->pos += len;
13444 return 0;
13445 }
13446 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13447 return -1;
13448 }
13449 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13450 str, 0, len);
13451 writer->pos += len;
13452 return 0;
13453}
13454
Victor Stinnere215d962012-10-06 23:03:36 +020013455int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013456_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13457 Py_ssize_t start, Py_ssize_t end)
13458{
13459 Py_UCS4 maxchar;
13460 Py_ssize_t len;
13461
13462 if (PyUnicode_READY(str) == -1)
13463 return -1;
13464
13465 assert(0 <= start);
13466 assert(end <= PyUnicode_GET_LENGTH(str));
13467 assert(start <= end);
13468
13469 if (end == 0)
13470 return 0;
13471
13472 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13473 return _PyUnicodeWriter_WriteStr(writer, str);
13474
13475 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13476 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13477 else
13478 maxchar = writer->maxchar;
13479 len = end - start;
13480
13481 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13482 return -1;
13483
13484 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13485 str, start, len);
13486 writer->pos += len;
13487 return 0;
13488}
13489
13490int
Victor Stinner4a587072013-11-19 12:54:53 +010013491_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13492 const char *ascii, Py_ssize_t len)
13493{
13494 if (len == -1)
13495 len = strlen(ascii);
13496
13497 assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13498
13499 if (writer->buffer == NULL && !writer->overallocate) {
13500 PyObject *str;
13501
13502 str = _PyUnicode_FromASCII(ascii, len);
13503 if (str == NULL)
13504 return -1;
13505
13506 writer->readonly = 1;
13507 writer->buffer = str;
13508 _PyUnicodeWriter_Update(writer);
13509 writer->pos += len;
13510 return 0;
13511 }
13512
13513 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13514 return -1;
13515
13516 switch (writer->kind)
13517 {
13518 case PyUnicode_1BYTE_KIND:
13519 {
13520 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13521 Py_UCS1 *data = writer->data;
13522
13523 Py_MEMCPY(data + writer->pos, str, len);
13524 break;
13525 }
13526 case PyUnicode_2BYTE_KIND:
13527 {
13528 _PyUnicode_CONVERT_BYTES(
13529 Py_UCS1, Py_UCS2,
13530 ascii, ascii + len,
13531 (Py_UCS2 *)writer->data + writer->pos);
13532 break;
13533 }
13534 case PyUnicode_4BYTE_KIND:
13535 {
13536 _PyUnicode_CONVERT_BYTES(
13537 Py_UCS1, Py_UCS4,
13538 ascii, ascii + len,
13539 (Py_UCS4 *)writer->data + writer->pos);
13540 break;
13541 }
13542 default:
13543 assert(0);
13544 }
13545
13546 writer->pos += len;
13547 return 0;
13548}
13549
13550int
13551_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13552 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020013553{
13554 Py_UCS4 maxchar;
13555
13556 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13557 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13558 return -1;
13559 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13560 writer->pos += len;
13561 return 0;
13562}
13563
Victor Stinnerd3f08822012-05-29 12:57:52 +020013564PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013565_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013566{
Victor Stinner15a0bd32013-07-08 22:29:55 +020013567 PyObject *str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013568 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013569 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013570 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013571 }
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013572 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013573 str = writer->buffer;
13574 writer->buffer = NULL;
13575 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13576 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013577 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013578 if (writer->pos == 0) {
13579 Py_CLEAR(writer->buffer);
13580
13581 /* Get the empty Unicode string singleton ('') */
13582 _Py_INCREF_UNICODE_EMPTY();
13583 str = unicode_empty;
Victor Stinner202fdca2012-05-07 12:47:02 +020013584 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013585 else {
13586 str = writer->buffer;
13587 writer->buffer = NULL;
13588
13589 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
13590 PyObject *str2;
13591 str2 = resize_compact(str, writer->pos);
13592 if (str2 == NULL)
13593 return NULL;
13594 str = str2;
13595 }
13596 }
13597
Victor Stinner15a0bd32013-07-08 22:29:55 +020013598 assert(_PyUnicode_CheckConsistency(str, 1));
13599 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020013600}
13601
Victor Stinnerd3f08822012-05-29 12:57:52 +020013602void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013603_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013604{
13605 Py_CLEAR(writer->buffer);
13606}
13607
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013608#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013609
13610PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013611 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013612\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013613Return a formatted version of S, using substitutions from args and kwargs.\n\
13614The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013615
Eric Smith27bbca62010-11-04 17:06:58 +000013616PyDoc_STRVAR(format_map__doc__,
13617 "S.format_map(mapping) -> str\n\
13618\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013619Return a formatted version of S, using substitutions from mapping.\n\
13620The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013621
Eric Smith4a7d76d2008-05-30 18:10:19 +000013622static PyObject *
13623unicode__format__(PyObject* self, PyObject* args)
13624{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013625 PyObject *format_spec;
13626 _PyUnicodeWriter writer;
13627 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013628
13629 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
13630 return NULL;
13631
Victor Stinnerd3f08822012-05-29 12:57:52 +020013632 if (PyUnicode_READY(self) == -1)
13633 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020013634 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013635 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13636 self, format_spec, 0,
13637 PyUnicode_GET_LENGTH(format_spec));
13638 if (ret == -1) {
13639 _PyUnicodeWriter_Dealloc(&writer);
13640 return NULL;
13641 }
13642 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013643}
13644
Eric Smith8c663262007-08-25 02:26:07 +000013645PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013646 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013647\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013648Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000013649
13650static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013651unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013652{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013653 Py_ssize_t size;
13654
13655 /* If it's a compact object, account for base structure +
13656 character data. */
13657 if (PyUnicode_IS_COMPACT_ASCII(v))
13658 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
13659 else if (PyUnicode_IS_COMPACT(v))
13660 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013661 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013662 else {
13663 /* If it is a two-block object, account for base object, and
13664 for character block if present. */
13665 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020013666 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013667 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013668 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013669 }
13670 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013671 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020013672 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013673 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020013674 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020013675 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013676
13677 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013678}
13679
13680PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013681 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013682
13683static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013684unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013685{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013686 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013687 if (!copy)
13688 return NULL;
13689 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013690}
13691
Guido van Rossumd57fd912000-03-10 22:53:23 +000013692static PyMethodDef unicode_methods[] = {
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000013693 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013694 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
Ezio Melotticda6b6d2012-02-26 09:39:55 +020013695 {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
13696 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013697 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
13698 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
Benjamin Petersond5890c82012-01-14 13:23:30 -050013699 {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013700 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
13701 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
13702 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
Ezio Melotti745d54d2013-11-16 19:10:57 +020013703 {"expandtabs", (PyCFunction) unicode_expandtabs,
13704 METH_VARARGS | METH_KEYWORDS, expandtabs__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013705 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013706 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013707 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
13708 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
13709 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013710 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013711 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13712 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
13713 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013714 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013715 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Ezio Melotti745d54d2013-11-16 19:10:57 +020013716 {"splitlines", (PyCFunction) unicode_splitlines,
13717 METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013718 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013719 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
13720 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
13721 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
13722 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13723 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
13724 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
13725 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
13726 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
13727 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
13728 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
13729 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
13730 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
13731 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
13732 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000013733 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000013734 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013735 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000013736 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013737 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000013738 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Larry Hastings31826802013-10-19 00:09:25 -070013739 UNICODE_MAKETRANS_METHODDEF
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013740 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000013741#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013742 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013743 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013744#endif
13745
Benjamin Peterson14339b62009-01-31 16:36:08 +000013746 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013747 {NULL, NULL}
13748};
13749
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013750static PyObject *
13751unicode_mod(PyObject *v, PyObject *w)
13752{
Brian Curtindfc80e32011-08-10 20:28:54 -050013753 if (!PyUnicode_Check(v))
13754 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013755 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013756}
13757
13758static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013759 0, /*nb_add*/
13760 0, /*nb_subtract*/
13761 0, /*nb_multiply*/
13762 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013763};
13764
Guido van Rossumd57fd912000-03-10 22:53:23 +000013765static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013766 (lenfunc) unicode_length, /* sq_length */
13767 PyUnicode_Concat, /* sq_concat */
13768 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13769 (ssizeargfunc) unicode_getitem, /* sq_item */
13770 0, /* sq_slice */
13771 0, /* sq_ass_item */
13772 0, /* sq_ass_slice */
13773 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013774};
13775
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013776static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013777unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013778{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013779 if (PyUnicode_READY(self) == -1)
13780 return NULL;
13781
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013782 if (PyIndex_Check(item)) {
13783 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013784 if (i == -1 && PyErr_Occurred())
13785 return NULL;
13786 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013787 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013788 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013789 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013790 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013791 PyObject *result;
13792 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013793 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013794 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013795
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013796 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013797 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013798 return NULL;
13799 }
13800
13801 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020013802 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013803 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013804 slicelength == PyUnicode_GET_LENGTH(self)) {
13805 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013806 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013807 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013808 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013809 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013810 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013811 src_kind = PyUnicode_KIND(self);
13812 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013813 if (!PyUnicode_IS_ASCII(self)) {
13814 kind_limit = kind_maxchar_limit(src_kind);
13815 max_char = 0;
13816 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13817 ch = PyUnicode_READ(src_kind, src_data, cur);
13818 if (ch > max_char) {
13819 max_char = ch;
13820 if (max_char >= kind_limit)
13821 break;
13822 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013823 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013824 }
Victor Stinner55c99112011-10-13 01:17:06 +020013825 else
13826 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013827 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013828 if (result == NULL)
13829 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013830 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013831 dest_data = PyUnicode_DATA(result);
13832
13833 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013834 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13835 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013836 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013837 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013838 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013839 } else {
13840 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13841 return NULL;
13842 }
13843}
13844
13845static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013846 (lenfunc)unicode_length, /* mp_length */
13847 (binaryfunc)unicode_subscript, /* mp_subscript */
13848 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013849};
13850
Guido van Rossumd57fd912000-03-10 22:53:23 +000013851
Guido van Rossumd57fd912000-03-10 22:53:23 +000013852/* Helpers for PyUnicode_Format() */
13853
Victor Stinnera47082312012-10-04 02:19:54 +020013854struct unicode_formatter_t {
13855 PyObject *args;
13856 int args_owned;
13857 Py_ssize_t arglen, argidx;
13858 PyObject *dict;
13859
13860 enum PyUnicode_Kind fmtkind;
13861 Py_ssize_t fmtcnt, fmtpos;
13862 void *fmtdata;
13863 PyObject *fmtstr;
13864
13865 _PyUnicodeWriter writer;
13866};
13867
13868struct unicode_format_arg_t {
13869 Py_UCS4 ch;
13870 int flags;
13871 Py_ssize_t width;
13872 int prec;
13873 int sign;
13874};
13875
Guido van Rossumd57fd912000-03-10 22:53:23 +000013876static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020013877unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013878{
Victor Stinnera47082312012-10-04 02:19:54 +020013879 Py_ssize_t argidx = ctx->argidx;
13880
13881 if (argidx < ctx->arglen) {
13882 ctx->argidx++;
13883 if (ctx->arglen < 0)
13884 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000013885 else
Victor Stinnera47082312012-10-04 02:19:54 +020013886 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013887 }
13888 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013889 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013890 return NULL;
13891}
13892
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013893/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013894
Victor Stinnera47082312012-10-04 02:19:54 +020013895/* Format a float into the writer if the writer is not NULL, or into *p_output
13896 otherwise.
13897
13898 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020013899static int
Victor Stinnera47082312012-10-04 02:19:54 +020013900formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
13901 PyObject **p_output,
13902 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013903{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013904 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013905 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013906 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020013907 int prec;
13908 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000013909
Guido van Rossumd57fd912000-03-10 22:53:23 +000013910 x = PyFloat_AsDouble(v);
13911 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020013912 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013913
Victor Stinnera47082312012-10-04 02:19:54 +020013914 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013915 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013916 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013917
Victor Stinnera47082312012-10-04 02:19:54 +020013918 if (arg->flags & F_ALT)
13919 dtoa_flags = Py_DTSF_ALT;
13920 else
13921 dtoa_flags = 0;
13922 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013923 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020013924 return -1;
13925 len = strlen(p);
13926 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010013927 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020013928 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013929 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020013930 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020013931 }
13932 else
13933 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000013934 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013935 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013936}
13937
Victor Stinnerd0880d52012-04-27 23:40:13 +020013938/* formatlong() emulates the format codes d, u, o, x and X, and
13939 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
13940 * Python's regular ints.
13941 * Return value: a new PyUnicodeObject*, or NULL if error.
13942 * The output string is of the form
13943 * "-"? ("0x" | "0X")? digit+
13944 * "0x"/"0X" are present only for x and X conversions, with F_ALT
13945 * set in flags. The case of hex digits will be correct,
13946 * There will be at least prec digits, zero-filled on the left if
13947 * necessary to get that many.
13948 * val object to be converted
13949 * flags bitmask of format flags; only F_ALT is looked at
13950 * prec minimum number of digits; 0-fill on left if needed
13951 * type a character in [duoxX]; u acts the same as d
13952 *
13953 * CAUTION: o, x and X conversions on regular ints can never
13954 * produce a '-' sign, but can for Python's unbounded ints.
13955 */
Ethan Furmanb95b5612015-01-23 20:05:18 -080013956PyObject *
13957_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
Tim Peters38fd5b62000-09-21 05:43:11 +000013958{
Victor Stinnerd0880d52012-04-27 23:40:13 +020013959 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013960 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013961 Py_ssize_t i;
13962 int sign; /* 1 if '-', else 0 */
13963 int len; /* number of characters */
13964 Py_ssize_t llen;
13965 int numdigits; /* len == numnondigits + numdigits */
13966 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000013967
Victor Stinnerd0880d52012-04-27 23:40:13 +020013968 /* Avoid exceeding SSIZE_T_MAX */
13969 if (prec > INT_MAX-3) {
13970 PyErr_SetString(PyExc_OverflowError,
13971 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013972 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013973 }
13974
13975 assert(PyLong_Check(val));
13976
13977 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020013978 default:
13979 assert(!"'type' not in [diuoxX]");
Victor Stinnerd0880d52012-04-27 23:40:13 +020013980 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020013981 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020013982 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070013983 /* int and int subclasses should print numerically when a numeric */
13984 /* format code is used (see issue18780) */
13985 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013986 break;
13987 case 'o':
13988 numnondigits = 2;
13989 result = PyNumber_ToBase(val, 8);
13990 break;
13991 case 'x':
13992 case 'X':
13993 numnondigits = 2;
13994 result = PyNumber_ToBase(val, 16);
13995 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013996 }
13997 if (!result)
13998 return NULL;
13999
14000 assert(unicode_modifiable(result));
14001 assert(PyUnicode_IS_READY(result));
14002 assert(PyUnicode_IS_ASCII(result));
14003
14004 /* To modify the string in-place, there can only be one reference. */
14005 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014006 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014007 PyErr_BadInternalCall();
14008 return NULL;
14009 }
14010 buf = PyUnicode_DATA(result);
14011 llen = PyUnicode_GET_LENGTH(result);
14012 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014013 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014014 PyErr_SetString(PyExc_ValueError,
Ethan Furmanb95b5612015-01-23 20:05:18 -080014015 "string too large in _PyUnicode_FormatLong");
Victor Stinnerd0880d52012-04-27 23:40:13 +020014016 return NULL;
14017 }
14018 len = (int)llen;
14019 sign = buf[0] == '-';
14020 numnondigits += sign;
14021 numdigits = len - numnondigits;
14022 assert(numdigits > 0);
14023
14024 /* Get rid of base marker unless F_ALT */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014025 if (((alt) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020014026 (type == 'o' || type == 'x' || type == 'X'))) {
14027 assert(buf[sign] == '0');
14028 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14029 buf[sign+1] == 'o');
14030 numnondigits -= 2;
14031 buf += 2;
14032 len -= 2;
14033 if (sign)
14034 buf[0] = '-';
14035 assert(len == numnondigits + numdigits);
14036 assert(numdigits > 0);
14037 }
14038
14039 /* Fill with leading zeroes to meet minimum width. */
14040 if (prec > numdigits) {
14041 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14042 numnondigits + prec);
14043 char *b1;
14044 if (!r1) {
14045 Py_DECREF(result);
14046 return NULL;
14047 }
14048 b1 = PyBytes_AS_STRING(r1);
14049 for (i = 0; i < numnondigits; ++i)
14050 *b1++ = *buf++;
14051 for (i = 0; i < prec - numdigits; i++)
14052 *b1++ = '0';
14053 for (i = 0; i < numdigits; i++)
14054 *b1++ = *buf++;
14055 *b1 = '\0';
14056 Py_DECREF(result);
14057 result = r1;
14058 buf = PyBytes_AS_STRING(result);
14059 len = numnondigits + prec;
14060 }
14061
14062 /* Fix up case for hex conversions. */
14063 if (type == 'X') {
14064 /* Need to convert all lower case letters to upper case.
14065 and need to convert 0x to 0X (and -0x to -0X). */
14066 for (i = 0; i < len; i++)
14067 if (buf[i] >= 'a' && buf[i] <= 'x')
14068 buf[i] -= 'a'-'A';
14069 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014070 if (!PyUnicode_Check(result)
14071 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020014072 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014073 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014074 Py_DECREF(result);
14075 result = unicode;
14076 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014077 else if (len != PyUnicode_GET_LENGTH(result)) {
14078 if (PyUnicode_Resize(&result, len) < 0)
14079 Py_CLEAR(result);
14080 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014081 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000014082}
14083
Ethan Furmandf3ed242014-01-05 06:50:30 -080014084/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014085 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014086 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014087 * -1 and raise an exception on error */
14088static int
Victor Stinnera47082312012-10-04 02:19:54 +020014089mainformatlong(PyObject *v,
14090 struct unicode_format_arg_t *arg,
14091 PyObject **p_output,
14092 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014093{
14094 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014095 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014096
14097 if (!PyNumber_Check(v))
14098 goto wrongtype;
14099
Ethan Furman9ab74802014-03-21 06:38:46 -070014100 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014101 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014102 if (type == 'o' || type == 'x' || type == 'X') {
14103 iobj = PyNumber_Index(v);
14104 if (iobj == NULL) {
Ethan Furman9ab74802014-03-21 06:38:46 -070014105 if (PyErr_ExceptionMatches(PyExc_TypeError))
14106 goto wrongtype;
Ethan Furman38d872e2014-03-19 08:38:52 -070014107 return -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014108 }
14109 }
14110 else {
14111 iobj = PyNumber_Long(v);
14112 if (iobj == NULL ) {
14113 if (PyErr_ExceptionMatches(PyExc_TypeError))
14114 goto wrongtype;
14115 return -1;
14116 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014117 }
14118 assert(PyLong_Check(iobj));
14119 }
14120 else {
14121 iobj = v;
14122 Py_INCREF(iobj);
14123 }
14124
14125 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014126 && arg->width == -1 && arg->prec == -1
14127 && !(arg->flags & (F_SIGN | F_BLANK))
14128 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014129 {
14130 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014131 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014132 int base;
14133
Victor Stinnera47082312012-10-04 02:19:54 +020014134 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014135 {
14136 default:
14137 assert(0 && "'type' not in [diuoxX]");
14138 case 'd':
14139 case 'i':
14140 case 'u':
14141 base = 10;
14142 break;
14143 case 'o':
14144 base = 8;
14145 break;
14146 case 'x':
14147 case 'X':
14148 base = 16;
14149 break;
14150 }
14151
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014152 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14153 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014154 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014155 }
14156 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014157 return 1;
14158 }
14159
Ethan Furmanb95b5612015-01-23 20:05:18 -080014160 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014161 Py_DECREF(iobj);
14162 if (res == NULL)
14163 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014164 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014165 return 0;
14166
14167wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014168 switch(type)
14169 {
14170 case 'o':
14171 case 'x':
14172 case 'X':
14173 PyErr_Format(PyExc_TypeError,
14174 "%%%c format: an integer is required, "
14175 "not %.200s",
14176 type, Py_TYPE(v)->tp_name);
14177 break;
14178 default:
14179 PyErr_Format(PyExc_TypeError,
14180 "%%%c format: a number is required, "
14181 "not %.200s",
14182 type, Py_TYPE(v)->tp_name);
14183 break;
14184 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014185 return -1;
14186}
14187
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014188static Py_UCS4
14189formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014190{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014191 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014192 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014193 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014194 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014195 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014196 goto onError;
14197 }
14198 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014199 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014200 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014201 /* make sure number is a type of integer */
14202 if (!PyLong_Check(v)) {
14203 iobj = PyNumber_Index(v);
14204 if (iobj == NULL) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014205 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014206 }
14207 v = iobj;
14208 Py_DECREF(iobj);
14209 }
14210 /* Integer input truncated to a character */
Benjamin Peterson29060642009-01-31 22:14:21 +000014211 x = PyLong_AsLong(v);
14212 if (x == -1 && PyErr_Occurred())
14213 goto onError;
14214
Victor Stinner8faf8212011-12-08 22:14:11 +010014215 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014216 PyErr_SetString(PyExc_OverflowError,
14217 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014218 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014219 }
14220
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014221 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014222 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014223
Benjamin Peterson29060642009-01-31 22:14:21 +000014224 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014225 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014226 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014227 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014228}
14229
Victor Stinnera47082312012-10-04 02:19:54 +020014230/* Parse options of an argument: flags, width, precision.
14231 Handle also "%(name)" syntax.
14232
14233 Return 0 if the argument has been formatted into arg->str.
14234 Return 1 if the argument has been written into ctx->writer,
14235 Raise an exception and return -1 on error. */
14236static int
14237unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14238 struct unicode_format_arg_t *arg)
14239{
14240#define FORMAT_READ(ctx) \
14241 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14242
14243 PyObject *v;
14244
Victor Stinnera47082312012-10-04 02:19:54 +020014245 if (arg->ch == '(') {
14246 /* Get argument value from a dictionary. Example: "%(name)s". */
14247 Py_ssize_t keystart;
14248 Py_ssize_t keylen;
14249 PyObject *key;
14250 int pcount = 1;
14251
14252 if (ctx->dict == NULL) {
14253 PyErr_SetString(PyExc_TypeError,
14254 "format requires a mapping");
14255 return -1;
14256 }
14257 ++ctx->fmtpos;
14258 --ctx->fmtcnt;
14259 keystart = ctx->fmtpos;
14260 /* Skip over balanced parentheses */
14261 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14262 arg->ch = FORMAT_READ(ctx);
14263 if (arg->ch == ')')
14264 --pcount;
14265 else if (arg->ch == '(')
14266 ++pcount;
14267 ctx->fmtpos++;
14268 }
14269 keylen = ctx->fmtpos - keystart - 1;
14270 if (ctx->fmtcnt < 0 || pcount > 0) {
14271 PyErr_SetString(PyExc_ValueError,
14272 "incomplete format key");
14273 return -1;
14274 }
14275 key = PyUnicode_Substring(ctx->fmtstr,
14276 keystart, keystart + keylen);
14277 if (key == NULL)
14278 return -1;
14279 if (ctx->args_owned) {
Victor Stinnera47082312012-10-04 02:19:54 +020014280 ctx->args_owned = 0;
Serhiy Storchaka191321d2015-12-27 15:41:34 +020014281 Py_DECREF(ctx->args);
Victor Stinnera47082312012-10-04 02:19:54 +020014282 }
14283 ctx->args = PyObject_GetItem(ctx->dict, key);
14284 Py_DECREF(key);
14285 if (ctx->args == NULL)
14286 return -1;
14287 ctx->args_owned = 1;
14288 ctx->arglen = -1;
14289 ctx->argidx = -2;
14290 }
14291
14292 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014293 while (--ctx->fmtcnt >= 0) {
14294 arg->ch = FORMAT_READ(ctx);
14295 ctx->fmtpos++;
14296 switch (arg->ch) {
14297 case '-': arg->flags |= F_LJUST; continue;
14298 case '+': arg->flags |= F_SIGN; continue;
14299 case ' ': arg->flags |= F_BLANK; continue;
14300 case '#': arg->flags |= F_ALT; continue;
14301 case '0': arg->flags |= F_ZERO; continue;
14302 }
14303 break;
14304 }
14305
14306 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014307 if (arg->ch == '*') {
14308 v = unicode_format_getnextarg(ctx);
14309 if (v == NULL)
14310 return -1;
14311 if (!PyLong_Check(v)) {
14312 PyErr_SetString(PyExc_TypeError,
14313 "* wants int");
14314 return -1;
14315 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014316 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014317 if (arg->width == -1 && PyErr_Occurred())
14318 return -1;
14319 if (arg->width < 0) {
14320 arg->flags |= F_LJUST;
14321 arg->width = -arg->width;
14322 }
14323 if (--ctx->fmtcnt >= 0) {
14324 arg->ch = FORMAT_READ(ctx);
14325 ctx->fmtpos++;
14326 }
14327 }
14328 else if (arg->ch >= '0' && arg->ch <= '9') {
14329 arg->width = arg->ch - '0';
14330 while (--ctx->fmtcnt >= 0) {
14331 arg->ch = FORMAT_READ(ctx);
14332 ctx->fmtpos++;
14333 if (arg->ch < '0' || arg->ch > '9')
14334 break;
14335 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14336 mixing signed and unsigned comparison. Since arg->ch is between
14337 '0' and '9', casting to int is safe. */
14338 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14339 PyErr_SetString(PyExc_ValueError,
14340 "width too big");
14341 return -1;
14342 }
14343 arg->width = arg->width*10 + (arg->ch - '0');
14344 }
14345 }
14346
14347 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014348 if (arg->ch == '.') {
14349 arg->prec = 0;
14350 if (--ctx->fmtcnt >= 0) {
14351 arg->ch = FORMAT_READ(ctx);
14352 ctx->fmtpos++;
14353 }
14354 if (arg->ch == '*') {
14355 v = unicode_format_getnextarg(ctx);
14356 if (v == NULL)
14357 return -1;
14358 if (!PyLong_Check(v)) {
14359 PyErr_SetString(PyExc_TypeError,
14360 "* wants int");
14361 return -1;
14362 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014363 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014364 if (arg->prec == -1 && PyErr_Occurred())
14365 return -1;
14366 if (arg->prec < 0)
14367 arg->prec = 0;
14368 if (--ctx->fmtcnt >= 0) {
14369 arg->ch = FORMAT_READ(ctx);
14370 ctx->fmtpos++;
14371 }
14372 }
14373 else if (arg->ch >= '0' && arg->ch <= '9') {
14374 arg->prec = arg->ch - '0';
14375 while (--ctx->fmtcnt >= 0) {
14376 arg->ch = FORMAT_READ(ctx);
14377 ctx->fmtpos++;
14378 if (arg->ch < '0' || arg->ch > '9')
14379 break;
14380 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14381 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014382 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014383 return -1;
14384 }
14385 arg->prec = arg->prec*10 + (arg->ch - '0');
14386 }
14387 }
14388 }
14389
14390 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14391 if (ctx->fmtcnt >= 0) {
14392 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14393 if (--ctx->fmtcnt >= 0) {
14394 arg->ch = FORMAT_READ(ctx);
14395 ctx->fmtpos++;
14396 }
14397 }
14398 }
14399 if (ctx->fmtcnt < 0) {
14400 PyErr_SetString(PyExc_ValueError,
14401 "incomplete format");
14402 return -1;
14403 }
14404 return 0;
14405
14406#undef FORMAT_READ
14407}
14408
14409/* Format one argument. Supported conversion specifiers:
14410
14411 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014412 - "i", "d", "u": int or float
14413 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014414 - "e", "E", "f", "F", "g", "G": float
14415 - "c": int or str (1 character)
14416
Victor Stinner8dbd4212012-12-04 09:30:24 +010014417 When possible, the output is written directly into the Unicode writer
14418 (ctx->writer). A string is created when padding is required.
14419
Victor Stinnera47082312012-10-04 02:19:54 +020014420 Return 0 if the argument has been formatted into *p_str,
14421 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014422 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014423static int
14424unicode_format_arg_format(struct unicode_formatter_t *ctx,
14425 struct unicode_format_arg_t *arg,
14426 PyObject **p_str)
14427{
14428 PyObject *v;
14429 _PyUnicodeWriter *writer = &ctx->writer;
14430
14431 if (ctx->fmtcnt == 0)
14432 ctx->writer.overallocate = 0;
14433
14434 if (arg->ch == '%') {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014435 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014436 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014437 return 1;
14438 }
14439
14440 v = unicode_format_getnextarg(ctx);
14441 if (v == NULL)
14442 return -1;
14443
Victor Stinnera47082312012-10-04 02:19:54 +020014444
14445 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014446 case 's':
14447 case 'r':
14448 case 'a':
14449 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14450 /* Fast path */
14451 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14452 return -1;
14453 return 1;
14454 }
14455
14456 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14457 *p_str = v;
14458 Py_INCREF(*p_str);
14459 }
14460 else {
14461 if (arg->ch == 's')
14462 *p_str = PyObject_Str(v);
14463 else if (arg->ch == 'r')
14464 *p_str = PyObject_Repr(v);
14465 else
14466 *p_str = PyObject_ASCII(v);
14467 }
14468 break;
14469
14470 case 'i':
14471 case 'd':
14472 case 'u':
14473 case 'o':
14474 case 'x':
14475 case 'X':
14476 {
14477 int ret = mainformatlong(v, arg, p_str, writer);
14478 if (ret != 0)
14479 return ret;
14480 arg->sign = 1;
14481 break;
14482 }
14483
14484 case 'e':
14485 case 'E':
14486 case 'f':
14487 case 'F':
14488 case 'g':
14489 case 'G':
14490 if (arg->width == -1 && arg->prec == -1
14491 && !(arg->flags & (F_SIGN | F_BLANK)))
14492 {
14493 /* Fast path */
14494 if (formatfloat(v, arg, NULL, writer) == -1)
14495 return -1;
14496 return 1;
14497 }
14498
14499 arg->sign = 1;
14500 if (formatfloat(v, arg, p_str, NULL) == -1)
14501 return -1;
14502 break;
14503
14504 case 'c':
14505 {
14506 Py_UCS4 ch = formatchar(v);
14507 if (ch == (Py_UCS4) -1)
14508 return -1;
14509 if (arg->width == -1 && arg->prec == -1) {
14510 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014511 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014512 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014513 return 1;
14514 }
14515 *p_str = PyUnicode_FromOrdinal(ch);
14516 break;
14517 }
14518
14519 default:
14520 PyErr_Format(PyExc_ValueError,
14521 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020014522 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020014523 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14524 (int)arg->ch,
14525 ctx->fmtpos - 1);
14526 return -1;
14527 }
14528 if (*p_str == NULL)
14529 return -1;
14530 assert (PyUnicode_Check(*p_str));
14531 return 0;
14532}
14533
14534static int
14535unicode_format_arg_output(struct unicode_formatter_t *ctx,
14536 struct unicode_format_arg_t *arg,
14537 PyObject *str)
14538{
14539 Py_ssize_t len;
14540 enum PyUnicode_Kind kind;
14541 void *pbuf;
14542 Py_ssize_t pindex;
14543 Py_UCS4 signchar;
14544 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014545 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014546 Py_ssize_t sublen;
14547 _PyUnicodeWriter *writer = &ctx->writer;
14548 Py_UCS4 fill;
14549
14550 fill = ' ';
14551 if (arg->sign && arg->flags & F_ZERO)
14552 fill = '0';
14553
14554 if (PyUnicode_READY(str) == -1)
14555 return -1;
14556
14557 len = PyUnicode_GET_LENGTH(str);
14558 if ((arg->width == -1 || arg->width <= len)
14559 && (arg->prec == -1 || arg->prec >= len)
14560 && !(arg->flags & (F_SIGN | F_BLANK)))
14561 {
14562 /* Fast path */
14563 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
14564 return -1;
14565 return 0;
14566 }
14567
14568 /* Truncate the string for "s", "r" and "a" formats
14569 if the precision is set */
14570 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
14571 if (arg->prec >= 0 && len > arg->prec)
14572 len = arg->prec;
14573 }
14574
14575 /* Adjust sign and width */
14576 kind = PyUnicode_KIND(str);
14577 pbuf = PyUnicode_DATA(str);
14578 pindex = 0;
14579 signchar = '\0';
14580 if (arg->sign) {
14581 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14582 if (ch == '-' || ch == '+') {
14583 signchar = ch;
14584 len--;
14585 pindex++;
14586 }
14587 else if (arg->flags & F_SIGN)
14588 signchar = '+';
14589 else if (arg->flags & F_BLANK)
14590 signchar = ' ';
14591 else
14592 arg->sign = 0;
14593 }
14594 if (arg->width < len)
14595 arg->width = len;
14596
14597 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014598 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014599 if (!(arg->flags & F_LJUST)) {
14600 if (arg->sign) {
14601 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014602 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014603 }
14604 else {
14605 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014606 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014607 }
14608 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014609 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
14610 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014611 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014612 }
14613
Victor Stinnera47082312012-10-04 02:19:54 +020014614 buflen = arg->width;
14615 if (arg->sign && len == arg->width)
14616 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014617 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020014618 return -1;
14619
14620 /* Write the sign if needed */
14621 if (arg->sign) {
14622 if (fill != ' ') {
14623 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14624 writer->pos += 1;
14625 }
14626 if (arg->width > len)
14627 arg->width--;
14628 }
14629
14630 /* Write the numeric prefix for "x", "X" and "o" formats
14631 if the alternate form is used.
14632 For example, write "0x" for the "%#x" format. */
14633 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14634 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14635 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14636 if (fill != ' ') {
14637 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14638 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14639 writer->pos += 2;
14640 pindex += 2;
14641 }
14642 arg->width -= 2;
14643 if (arg->width < 0)
14644 arg->width = 0;
14645 len -= 2;
14646 }
14647
14648 /* Pad left with the fill character if needed */
14649 if (arg->width > len && !(arg->flags & F_LJUST)) {
14650 sublen = arg->width - len;
14651 FILL(writer->kind, writer->data, fill, writer->pos, sublen);
14652 writer->pos += sublen;
14653 arg->width = len;
14654 }
14655
14656 /* If padding with spaces: write sign if needed and/or numeric prefix if
14657 the alternate form is used */
14658 if (fill == ' ') {
14659 if (arg->sign) {
14660 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14661 writer->pos += 1;
14662 }
14663 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14664 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14665 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
14666 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14667 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14668 writer->pos += 2;
14669 pindex += 2;
14670 }
14671 }
14672
14673 /* Write characters */
14674 if (len) {
14675 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14676 str, pindex, len);
14677 writer->pos += len;
14678 }
14679
14680 /* Pad right with the fill character if needed */
14681 if (arg->width > len) {
14682 sublen = arg->width - len;
14683 FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
14684 writer->pos += sublen;
14685 }
14686 return 0;
14687}
14688
14689/* Helper of PyUnicode_Format(): format one arg.
14690 Return 0 on success, raise an exception and return -1 on error. */
14691static int
14692unicode_format_arg(struct unicode_formatter_t *ctx)
14693{
14694 struct unicode_format_arg_t arg;
14695 PyObject *str;
14696 int ret;
14697
Victor Stinner8dbd4212012-12-04 09:30:24 +010014698 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
14699 arg.flags = 0;
14700 arg.width = -1;
14701 arg.prec = -1;
14702 arg.sign = 0;
14703 str = NULL;
14704
Victor Stinnera47082312012-10-04 02:19:54 +020014705 ret = unicode_format_arg_parse(ctx, &arg);
14706 if (ret == -1)
14707 return -1;
14708
14709 ret = unicode_format_arg_format(ctx, &arg, &str);
14710 if (ret == -1)
14711 return -1;
14712
14713 if (ret != 1) {
14714 ret = unicode_format_arg_output(ctx, &arg, str);
14715 Py_DECREF(str);
14716 if (ret == -1)
14717 return -1;
14718 }
14719
14720 if (ctx->dict && (ctx->argidx < ctx->arglen) && arg.ch != '%') {
14721 PyErr_SetString(PyExc_TypeError,
14722 "not all arguments converted during string formatting");
14723 return -1;
14724 }
14725 return 0;
14726}
14727
Alexander Belopolsky40018472011-02-26 01:02:56 +000014728PyObject *
14729PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014730{
Victor Stinnera47082312012-10-04 02:19:54 +020014731 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000014732
Guido van Rossumd57fd912000-03-10 22:53:23 +000014733 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014734 PyErr_BadInternalCall();
14735 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014736 }
Victor Stinnera47082312012-10-04 02:19:54 +020014737
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014738 if (ensure_unicode(format) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014739 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014740
14741 ctx.fmtstr = format;
Victor Stinnera47082312012-10-04 02:19:54 +020014742 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14743 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14744 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14745 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014746
Victor Stinner8f674cc2013-04-17 23:02:17 +020014747 _PyUnicodeWriter_Init(&ctx.writer);
14748 ctx.writer.min_length = ctx.fmtcnt + 100;
14749 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014750
Guido van Rossumd57fd912000-03-10 22:53:23 +000014751 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014752 ctx.arglen = PyTuple_Size(args);
14753 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014754 }
14755 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014756 ctx.arglen = -1;
14757 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014758 }
Victor Stinnera47082312012-10-04 02:19:54 +020014759 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040014760 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020014761 ctx.dict = args;
14762 else
14763 ctx.dict = NULL;
14764 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014765
Victor Stinnera47082312012-10-04 02:19:54 +020014766 while (--ctx.fmtcnt >= 0) {
14767 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020014768 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020014769
14770 nonfmtpos = ctx.fmtpos++;
14771 while (ctx.fmtcnt >= 0 &&
14772 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14773 ctx.fmtpos++;
14774 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014775 }
Victor Stinnera47082312012-10-04 02:19:54 +020014776 if (ctx.fmtcnt < 0) {
14777 ctx.fmtpos--;
14778 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020014779 }
Victor Stinneree4544c2012-05-09 22:24:08 +020014780
Victor Stinnercfc4c132013-04-03 01:48:39 +020014781 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14782 nonfmtpos, ctx.fmtpos) < 0)
14783 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014784 }
14785 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014786 ctx.fmtpos++;
14787 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000014788 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020014789 }
14790 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020014791
Victor Stinnera47082312012-10-04 02:19:54 +020014792 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014793 PyErr_SetString(PyExc_TypeError,
14794 "not all arguments converted during string formatting");
14795 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014796 }
14797
Victor Stinnera47082312012-10-04 02:19:54 +020014798 if (ctx.args_owned) {
14799 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014800 }
Victor Stinnera47082312012-10-04 02:19:54 +020014801 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014802
Benjamin Peterson29060642009-01-31 22:14:21 +000014803 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020014804 _PyUnicodeWriter_Dealloc(&ctx.writer);
14805 if (ctx.args_owned) {
14806 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014807 }
14808 return NULL;
14809}
14810
Jeremy Hylton938ace62002-07-17 16:30:39 +000014811static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000014812unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14813
Tim Peters6d6c1a32001-08-02 04:15:00 +000014814static PyObject *
14815unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14816{
Benjamin Peterson29060642009-01-31 22:14:21 +000014817 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014818 static char *kwlist[] = {"object", "encoding", "errors", 0};
14819 char *encoding = NULL;
14820 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000014821
Benjamin Peterson14339b62009-01-31 16:36:08 +000014822 if (type != &PyUnicode_Type)
14823 return unicode_subtype_new(type, args, kwds);
14824 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000014825 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000014826 return NULL;
14827 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020014828 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000014829 if (encoding == NULL && errors == NULL)
14830 return PyObject_Str(x);
14831 else
Benjamin Peterson29060642009-01-31 22:14:21 +000014832 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000014833}
14834
Guido van Rossume023fe02001-08-30 03:12:59 +000014835static PyObject *
14836unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14837{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014838 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014839 Py_ssize_t length, char_size;
14840 int share_wstr, share_utf8;
14841 unsigned int kind;
14842 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000014843
Benjamin Peterson14339b62009-01-31 16:36:08 +000014844 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014845
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014846 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014847 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014848 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014849 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050014850 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060014851 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014852 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060014853 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014854
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014855 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014856 if (self == NULL) {
14857 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014858 return NULL;
14859 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014860 kind = PyUnicode_KIND(unicode);
14861 length = PyUnicode_GET_LENGTH(unicode);
14862
14863 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014864#ifdef Py_DEBUG
14865 _PyUnicode_HASH(self) = -1;
14866#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014867 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014868#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014869 _PyUnicode_STATE(self).interned = 0;
14870 _PyUnicode_STATE(self).kind = kind;
14871 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020014872 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014873 _PyUnicode_STATE(self).ready = 1;
14874 _PyUnicode_WSTR(self) = NULL;
14875 _PyUnicode_UTF8_LENGTH(self) = 0;
14876 _PyUnicode_UTF8(self) = NULL;
14877 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020014878 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014879
14880 share_utf8 = 0;
14881 share_wstr = 0;
14882 if (kind == PyUnicode_1BYTE_KIND) {
14883 char_size = 1;
14884 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
14885 share_utf8 = 1;
14886 }
14887 else if (kind == PyUnicode_2BYTE_KIND) {
14888 char_size = 2;
14889 if (sizeof(wchar_t) == 2)
14890 share_wstr = 1;
14891 }
14892 else {
14893 assert(kind == PyUnicode_4BYTE_KIND);
14894 char_size = 4;
14895 if (sizeof(wchar_t) == 4)
14896 share_wstr = 1;
14897 }
14898
14899 /* Ensure we won't overflow the length. */
14900 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
14901 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014902 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014903 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014904 data = PyObject_MALLOC((length + 1) * char_size);
14905 if (data == NULL) {
14906 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014907 goto onError;
14908 }
14909
Victor Stinnerc3c74152011-10-02 20:39:55 +020014910 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014911 if (share_utf8) {
14912 _PyUnicode_UTF8_LENGTH(self) = length;
14913 _PyUnicode_UTF8(self) = data;
14914 }
14915 if (share_wstr) {
14916 _PyUnicode_WSTR_LENGTH(self) = length;
14917 _PyUnicode_WSTR(self) = (wchar_t *)data;
14918 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014919
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014920 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020014921 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014922 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014923#ifdef Py_DEBUG
14924 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14925#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020014926 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010014927 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014928
14929onError:
14930 Py_DECREF(unicode);
14931 Py_DECREF(self);
14932 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000014933}
14934
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000014935PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070014936"str(object='') -> str\n\
14937str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000014938\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100014939Create a new string object from the given object. If encoding or\n\
14940errors is specified, then the object must expose a data buffer\n\
14941that will be decoded using the given encoding and error handler.\n\
14942Otherwise, returns the result of object.__str__() (if defined)\n\
14943or repr(object).\n\
14944encoding defaults to sys.getdefaultencoding().\n\
14945errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000014946
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014947static PyObject *unicode_iter(PyObject *seq);
14948
Guido van Rossumd57fd912000-03-10 22:53:23 +000014949PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000014950 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014951 "str", /* tp_name */
14952 sizeof(PyUnicodeObject), /* tp_size */
14953 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014954 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014955 (destructor)unicode_dealloc, /* tp_dealloc */
14956 0, /* tp_print */
14957 0, /* tp_getattr */
14958 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014959 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014960 unicode_repr, /* tp_repr */
14961 &unicode_as_number, /* tp_as_number */
14962 &unicode_as_sequence, /* tp_as_sequence */
14963 &unicode_as_mapping, /* tp_as_mapping */
14964 (hashfunc) unicode_hash, /* tp_hash*/
14965 0, /* tp_call*/
14966 (reprfunc) unicode_str, /* tp_str */
14967 PyObject_GenericGetAttr, /* tp_getattro */
14968 0, /* tp_setattro */
14969 0, /* tp_as_buffer */
14970 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000014971 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014972 unicode_doc, /* tp_doc */
14973 0, /* tp_traverse */
14974 0, /* tp_clear */
14975 PyUnicode_RichCompare, /* tp_richcompare */
14976 0, /* tp_weaklistoffset */
14977 unicode_iter, /* tp_iter */
14978 0, /* tp_iternext */
14979 unicode_methods, /* tp_methods */
14980 0, /* tp_members */
14981 0, /* tp_getset */
14982 &PyBaseObject_Type, /* tp_base */
14983 0, /* tp_dict */
14984 0, /* tp_descr_get */
14985 0, /* tp_descr_set */
14986 0, /* tp_dictoffset */
14987 0, /* tp_init */
14988 0, /* tp_alloc */
14989 unicode_new, /* tp_new */
14990 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014991};
14992
14993/* Initialize the Unicode implementation */
14994
Victor Stinner3a50e702011-10-18 21:21:00 +020014995int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014996{
Thomas Wouters477c8d52006-05-27 19:21:47 +000014997 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014998 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000014999 0x000A, /* LINE FEED */
15000 0x000D, /* CARRIAGE RETURN */
15001 0x001C, /* FILE SEPARATOR */
15002 0x001D, /* GROUP SEPARATOR */
15003 0x001E, /* RECORD SEPARATOR */
15004 0x0085, /* NEXT LINE */
15005 0x2028, /* LINE SEPARATOR */
15006 0x2029, /* PARAGRAPH SEPARATOR */
15007 };
15008
Fred Drakee4315f52000-05-09 19:53:39 +000015009 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020015010 _Py_INCREF_UNICODE_EMPTY();
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015011 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015012 Py_FatalError("Can't create empty string");
Serhiy Storchaka678db842013-01-26 12:16:36 +020015013 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015014
Guido van Rossumcacfc072002-05-24 19:01:59 +000015015 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000015016 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000015017
15018 /* initialize the linebreak bloom filter */
15019 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015020 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020015021 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015022
Christian Heimes26532f72013-07-20 14:57:16 +020015023 if (PyType_Ready(&EncodingMapType) < 0)
15024 Py_FatalError("Can't initialize encoding map type");
Victor Stinner3a50e702011-10-18 21:21:00 +020015025
Benjamin Petersonc4311282012-10-30 23:21:10 -040015026 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
15027 Py_FatalError("Can't initialize field name iterator type");
15028
15029 if (PyType_Ready(&PyFormatterIter_Type) < 0)
15030 Py_FatalError("Can't initialize formatter iter type");
Benjamin Petersone8ea97f2012-10-30 23:27:52 -040015031
Victor Stinner3a50e702011-10-18 21:21:00 +020015032 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015033}
15034
15035/* Finalize the Unicode implementation */
15036
/* Nothing is released here: the function unconditionally reports 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int freed = 0;

    return freed;
}
15042
Guido van Rossumd57fd912000-03-10 22:53:23 +000015043void
Thomas Wouters78890102000-07-22 19:25:51 +000015044_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015045{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000015046 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015047
Serhiy Storchaka05997252013-01-26 12:14:02 +020015048 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +000015049
Serhiy Storchaka05997252013-01-26 12:14:02 +020015050 for (i = 0; i < 256; i++)
15051 Py_CLEAR(unicode_latin1[i]);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020015052 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000015053 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015054}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015055
Walter Dörwald16807132007-05-25 13:52:07 +000015056void
15057PyUnicode_InternInPlace(PyObject **p)
15058{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015059 PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015060 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015061#ifdef Py_DEBUG
15062 assert(s != NULL);
15063 assert(_PyUnicode_CHECK(s));
15064#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000015065 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020015066 return;
15067#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015068 /* If it's a subclass, we don't really know what putting
15069 it in the interned dict might do. */
15070 if (!PyUnicode_CheckExact(s))
15071 return;
15072 if (PyUnicode_CHECK_INTERNED(s))
15073 return;
15074 if (interned == NULL) {
15075 interned = PyDict_New();
15076 if (interned == NULL) {
15077 PyErr_Clear(); /* Don't leave an exception */
15078 return;
15079 }
15080 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015081 Py_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015082 t = PyDict_SetDefault(interned, s, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015083 Py_END_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015084 if (t == NULL) {
15085 PyErr_Clear();
15086 return;
15087 }
15088 if (t != s) {
Victor Stinnerf0335102013-04-14 19:13:03 +020015089 Py_INCREF(t);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030015090 Py_SETREF(*p, t);
Victor Stinnerf0335102013-04-14 19:13:03 +020015091 return;
15092 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015093 /* The two references in interned are not counted by refcnt.
15094 The deallocator will take care of this */
15095 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015096 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000015097}
15098
15099void
15100PyUnicode_InternImmortal(PyObject **p)
15101{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015102 PyUnicode_InternInPlace(p);
15103 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015104 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015105 Py_INCREF(*p);
15106 }
Walter Dörwald16807132007-05-25 13:52:07 +000015107}
15108
15109PyObject *
15110PyUnicode_InternFromString(const char *cp)
15111{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015112 PyObject *s = PyUnicode_FromString(cp);
15113 if (s == NULL)
15114 return NULL;
15115 PyUnicode_InternInPlace(&s);
15116 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015117}
15118
Alexander Belopolsky40018472011-02-26 01:02:56 +000015119void
15120_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000015121{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015122 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015123 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015124 Py_ssize_t i, n;
15125 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000015126
Benjamin Peterson14339b62009-01-31 16:36:08 +000015127 if (interned == NULL || !PyDict_Check(interned))
15128 return;
15129 keys = PyDict_Keys(interned);
15130 if (keys == NULL || !PyList_Check(keys)) {
15131 PyErr_Clear();
15132 return;
15133 }
Walter Dörwald16807132007-05-25 13:52:07 +000015134
Benjamin Peterson14339b62009-01-31 16:36:08 +000015135 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
15136 detector, interned unicode strings are not forcibly deallocated;
15137 rather, we give them their stolen references back, and then clear
15138 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000015139
Benjamin Peterson14339b62009-01-31 16:36:08 +000015140 n = PyList_GET_SIZE(keys);
15141 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000015142 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015143 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015144 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015145 if (PyUnicode_READY(s) == -1) {
15146 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015147 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015148 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015149 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015150 case SSTATE_NOT_INTERNED:
15151 /* XXX Shouldn't happen */
15152 break;
15153 case SSTATE_INTERNED_IMMORTAL:
15154 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015155 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015156 break;
15157 case SSTATE_INTERNED_MORTAL:
15158 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015159 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015160 break;
15161 default:
15162 Py_FatalError("Inconsistent interned string state.");
15163 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015164 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015165 }
15166 fprintf(stderr, "total size of all interned strings: "
15167 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
15168 "mortal/immortal\n", mortal_size, immortal_size);
15169 Py_DECREF(keys);
15170 PyDict_Clear(interned);
Serhiy Storchaka05997252013-01-26 12:14:02 +020015171 Py_CLEAR(interned);
Walter Dörwald16807132007-05-25 13:52:07 +000015172}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015173
15174
/********************* Unicode Iterator **************************/
15176
15177typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015178 PyObject_HEAD
15179 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015180 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015181} unicodeiterobject;
15182
15183static void
15184unicodeiter_dealloc(unicodeiterobject *it)
15185{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015186 _PyObject_GC_UNTRACK(it);
15187 Py_XDECREF(it->it_seq);
15188 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015189}
15190
15191static int
15192unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15193{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015194 Py_VISIT(it->it_seq);
15195 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015196}
15197
15198static PyObject *
15199unicodeiter_next(unicodeiterobject *it)
15200{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015201 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015202
Benjamin Peterson14339b62009-01-31 16:36:08 +000015203 assert(it != NULL);
15204 seq = it->it_seq;
15205 if (seq == NULL)
15206 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015207 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015208
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015209 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15210 int kind = PyUnicode_KIND(seq);
15211 void *data = PyUnicode_DATA(seq);
15212 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15213 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015214 if (item != NULL)
15215 ++it->it_index;
15216 return item;
15217 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015218
Benjamin Peterson14339b62009-01-31 16:36:08 +000015219 it->it_seq = NULL;
Serhiy Storchakafbb1c5e2016-03-30 20:40:02 +030015220 Py_DECREF(seq);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015221 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015222}
15223
15224static PyObject *
15225unicodeiter_len(unicodeiterobject *it)
15226{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015227 Py_ssize_t len = 0;
15228 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015229 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015230 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015231}
15232
15233PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15234
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015235static PyObject *
15236unicodeiter_reduce(unicodeiterobject *it)
15237{
15238 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020015239 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015240 it->it_seq, it->it_index);
15241 } else {
15242 PyObject *u = PyUnicode_FromUnicode(NULL, 0);
15243 if (u == NULL)
15244 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020015245 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015246 }
15247}
15248
15249PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15250
15251static PyObject *
15252unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15253{
15254 Py_ssize_t index = PyLong_AsSsize_t(state);
15255 if (index == -1 && PyErr_Occurred())
15256 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015257 if (it->it_seq != NULL) {
15258 if (index < 0)
15259 index = 0;
15260 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15261 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15262 it->it_index = index;
15263 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015264 Py_RETURN_NONE;
15265}
15266
15267PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15268
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015269static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015270 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015271 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015272 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15273 reduce_doc},
15274 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15275 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015276 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015277};
15278
15279PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015280 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15281 "str_iterator", /* tp_name */
15282 sizeof(unicodeiterobject), /* tp_basicsize */
15283 0, /* tp_itemsize */
15284 /* methods */
15285 (destructor)unicodeiter_dealloc, /* tp_dealloc */
15286 0, /* tp_print */
15287 0, /* tp_getattr */
15288 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000015289 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015290 0, /* tp_repr */
15291 0, /* tp_as_number */
15292 0, /* tp_as_sequence */
15293 0, /* tp_as_mapping */
15294 0, /* tp_hash */
15295 0, /* tp_call */
15296 0, /* tp_str */
15297 PyObject_GenericGetAttr, /* tp_getattro */
15298 0, /* tp_setattro */
15299 0, /* tp_as_buffer */
15300 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15301 0, /* tp_doc */
15302 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15303 0, /* tp_clear */
15304 0, /* tp_richcompare */
15305 0, /* tp_weaklistoffset */
15306 PyObject_SelfIter, /* tp_iter */
15307 (iternextfunc)unicodeiter_next, /* tp_iternext */
15308 unicodeiter_methods, /* tp_methods */
15309 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015310};
15311
15312static PyObject *
15313unicode_iter(PyObject *seq)
15314{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015315 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015316
Benjamin Peterson14339b62009-01-31 16:36:08 +000015317 if (!PyUnicode_Check(seq)) {
15318 PyErr_BadInternalCall();
15319 return NULL;
15320 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015321 if (PyUnicode_READY(seq) == -1)
15322 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015323 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15324 if (it == NULL)
15325 return NULL;
15326 it->it_index = 0;
15327 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015328 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015329 _PyObject_GC_TRACK(it);
15330 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015331}
15332
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015333
15334size_t
15335Py_UNICODE_strlen(const Py_UNICODE *u)
15336{
15337 int res = 0;
15338 while(*u++)
15339 res++;
15340 return res;
15341}
15342
15343Py_UNICODE*
15344Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15345{
15346 Py_UNICODE *u = s1;
15347 while ((*u++ = *s2++));
15348 return s1;
15349}
15350
15351Py_UNICODE*
15352Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15353{
15354 Py_UNICODE *u = s1;
15355 while ((*u++ = *s2++))
15356 if (n-- == 0)
15357 break;
15358 return s1;
15359}
15360
15361Py_UNICODE*
15362Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15363{
15364 Py_UNICODE *u1 = s1;
15365 u1 += Py_UNICODE_strlen(u1);
15366 Py_UNICODE_strcpy(u1, s2);
15367 return s1;
15368}
15369
15370int
15371Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15372{
15373 while (*s1 && *s2 && *s1 == *s2)
15374 s1++, s2++;
15375 if (*s1 && *s2)
15376 return (*s1 < *s2) ? -1 : +1;
15377 if (*s1)
15378 return 1;
15379 if (*s2)
15380 return -1;
15381 return 0;
15382}
15383
15384int
15385Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15386{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015387 Py_UNICODE u1, u2;
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015388 for (; n != 0; n--) {
15389 u1 = *s1;
15390 u2 = *s2;
15391 if (u1 != u2)
15392 return (u1 < u2) ? -1 : +1;
15393 if (u1 == '\0')
15394 return 0;
15395 s1++;
15396 s2++;
15397 }
15398 return 0;
15399}
15400
15401Py_UNICODE*
15402Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15403{
15404 const Py_UNICODE *p;
15405 for (p = s; *p; p++)
15406 if (*p == c)
15407 return (Py_UNICODE*)p;
15408 return NULL;
15409}
15410
15411Py_UNICODE*
15412Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15413{
15414 const Py_UNICODE *p;
15415 p = s + Py_UNICODE_strlen(s);
15416 while (p != s) {
15417 p--;
15418 if (*p == c)
15419 return (Py_UNICODE*)p;
15420 }
15421 return NULL;
15422}
Victor Stinner331ea922010-08-10 16:37:20 +000015423
Victor Stinner71133ff2010-09-01 23:43:53 +000015424Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015425PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015426{
Victor Stinner577db2c2011-10-11 22:12:48 +020015427 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015428 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015429
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015430 if (!PyUnicode_Check(unicode)) {
15431 PyErr_BadArgument();
15432 return NULL;
15433 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015434 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015435 if (u == NULL)
15436 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015437 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -070015438 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015439 PyErr_NoMemory();
15440 return NULL;
15441 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015442 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015443 size *= sizeof(Py_UNICODE);
15444 copy = PyMem_Malloc(size);
15445 if (copy == NULL) {
15446 PyErr_NoMemory();
15447 return NULL;
15448 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015449 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015450 return copy;
15451}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015452
Georg Brandl66c221e2010-10-14 07:04:07 +000015453/* A _string module, to export formatter_parser and formatter_field_name_split
15454 to the string.Formatter class implemented in Python. */
15455
15456static PyMethodDef _string_methods[] = {
15457 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15458 METH_O, PyDoc_STR("split the argument as a field name")},
15459 {"formatter_parser", (PyCFunction) formatter_parser,
15460 METH_O, PyDoc_STR("parse the argument as a format string")},
15461 {NULL, NULL}
15462};
15463
15464static struct PyModuleDef _string_module = {
15465 PyModuleDef_HEAD_INIT,
15466 "_string",
15467 PyDoc_STR("string helper module"),
15468 0,
15469 _string_methods,
15470 NULL,
15471 NULL,
15472 NULL,
15473 NULL
15474};
15475
15476PyMODINIT_FUNC
15477PyInit__string(void)
15478{
15479 return PyModule_Create(&_string_module);
15480}
15481
15482
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000015483#ifdef __cplusplus
15484}
15485#endif